Rumah >pembangunan bahagian belakang >tutorial php >Greenplum创建表--分布键_PHP教程
Greenplum默认使用hash分布策略。该策略可选一个或者多个列作为分布键(distribution key,简称DK)。对分布键做hash运算来确定数据存放到哪个segment上,相同的分布键值会hash到相同的segment上。表上最好有唯一键或者主键,这样能保证数据均衡分布到各个segment上。语法:distributed by。
采用随机分布策略时,数据会被随机分布到segment上,相同记录可能会存放在不同的segment上。随机分布可以保证数据平均,但是Greenplum没有跨节点的唯一键约束,所以无法保证数据唯一。基于唯一性和性能考虑,推荐使用hash分布,性能部分会另开一篇文档详细介绍。语法:distributed randomly。
testDB=# create table t_hash(id int,name varchar(50)) distributed by (id); CREATE TABLE testDB=# testDB=# \d t_hash Table "public.t_hash" Column | Type | Modifiers --------+-----------------------+----------- id | integer | name | character varying(50) | Distributed by: (id)
testDB=# alter table t_hash add primary key (name); NOTICE: updating distribution policy to match new primary key NOTICE: ALTER TABLE / ADD PRIMARY KEY will create implicit index "t_hash_pkey" for table "t_hash" ALTER TABLE testDB=# \d t_hash Table "public.t_hash" Column | Type | Modifiers --------+-----------------------+----------- id | integer | name | character varying(50) | not null Indexes: "t_hash_pkey" PRIMARY KEY, btree (name) Distributed by: (name)
testDB=# insert into t_hash values(1,'szlsd1'); INSERT 0 1 testDB=# testDB=# insert into t_hash values(2,'szlsd1'); ERROR: duplicate key violates unique constraint "t_hash_pkey"(seg2 gp-s3:40000 pid=3855)
testDB=# create unique index u_id on t_hash(name); CREATE INDEX testDB=# testDB=# testDB=# \d t_hash Table "public.t_hash" Column | Type | Modifiers --------+-----------------------+----------- id | integer | name | character varying(50) | not null Indexes: "t_hash_pkey" PRIMARY KEY, btree (name) "u_id" UNIQUE, btree (name) Distributed by: (name)
testDB=# create unique index uk_id on t_hash(id); ERROR: UNIQUE index must contain all columns in the distribution key of relation "t_hash" testDB=# create unique index uk_id on t_hash(id,name); CREATE INDEX testDB=# \d t_hash Table "public.t_hash" Column | Type | Modifiers --------+-----------------------+----------- id | integer | name | character varying(50) | not null Indexes: "t_hash_pkey" PRIMARY KEY, btree (name) "uk_id" UNIQUE, btree (id, name) Distributed by: (name)
testDB=# alter table t_hash drop constraint t_hash_pkey; ALTER TABLE testDB=# \d t_hash Table "public.t_hash" Column | Type | Modifiers --------+-----------------------+----------- id | integer | name | character varying(50) | not null Distributed by: (name)
testDB=# insert into t_hash values(1,'szlsd'); INSERT 0 1 testDB=# insert into t_hash values(2,'szlsd'); INSERT 0 1 testDB=# insert into t_hash values(3,'szlsd'); INSERT 0 1 testDB=# insert into t_hash values(4,'szlsd'); INSERT 0 1 testDB=# insert into t_hash values(5,'szlsd'); INSERT 0 1 testDB=# insert into t_hash values(6,'szlsd'); INSERT 0 1 testDB=# testDB=# testDB=# select gp_segment_id,count(*) from t_hash group by gp_segment_id; gp_segment_id | count ---------------+------- 2 | 7 (1 row)
创建随机分布表需加distributed randomly关键字。随机分布表不使用任何列作为分布键,每行数据具体落在哪个segment上不得而知。
testDB=# create table t_random(id int ,name varchar(100)) distributed randomly; CREATE TABLE testDB=# testDB=# testDB=# \d t_random Table "public.t_random" Column | Type | Modifiers --------+------------------------+----------- id | integer | name | character varying(100) | Distributed randomly
testDB=# alter table t_random add primary key (id,name); ERROR: PRIMARY KEY and DISTRIBUTED RANDOMLY are incompatible testDB=# testDB=# create unique index uk_r_id on t_random(id); ERROR: UNIQUE and DISTRIBUTED RANDOMLY are incompatible testDB=#
testDB=# insert into t_random values(1,'szlsd3'); INSERT 0 1 testDB=# select gp_segment_id,count(*) from t_random group by gp_segment_id; gp_segment_id | count ---------------+------- 1 | 1 (1 row) testDB=# testDB=# insert into t_random values(1,'szlsd3'); INSERT 0 1 testDB=# select gp_segment_id,count(*) from t_random group by gp_segment_id; gp_segment_id | count ---------------+------- 2 | 1 1 | 1 (2 rows) testDB=# insert into t_random values(1,'szlsd3'); INSERT 0 1 testDB=# select gp_segment_id,count(*) from t_random group by gp_segment_id; gp_segment_id | count ---------------+------- 2 | 1 1 | 2 (2 rows) testDB=# insert into t_random values(1,'szlsd3'); INSERT 0 1 testDB=# select gp_segment_id,count(*) from t_random group by gp_segment_id; gp_segment_id | count ---------------+------- 2 | 2 1 | 2 (2 rows) testDB=# insert into t_random values(1,'szlsd3'); INSERT 0 1 testDB=# select gp_segment_id,count(*) from t_random group by gp_segment_id; gp_segment_id | count ---------------+------- 2 | 2 1 | 3 (2 rows) testDB=# insert into t_random values(1,'szlsd3'); INSERT 0 1 testDB=# select gp_segment_id,count(*) from t_random group by gp_segment_id; gp_segment_id | count ---------------+------- 2 | 2 1 | 3 0 | 1 (3 rows)
testDB=# \d t_hash; Table "public.t_hash" Column | Type | Modifiers --------+-----------------------+----------- id | integer | name | character varying(50) | not null Indexes: "t_hash_pkey" PRIMARY KEY, btree (name) "uk_id" UNIQUE, btree (id, name) Distributed by: (name) testDB=# testDB=# testDB=# create table t_hash_1 as select * from t_hash; NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'name' as the Greenplum Database data distribution key for this table. HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. SELECT 0 testDB=# \d t_hash_1 Table "public.t_hash_1" Column | Type | Modifiers --------+-----------------------+----------- id | integer | name | character varying(50) | Distributed by: (name) testDB=# testDB=# create table t_hash_2 (like t_hash); NOTICE: Table doesn't have 'distributed by' clause, defaulting to distribution columns from LIKE table CREATE TABLE testDB=# \d t_hash_2 Table "public.t_hash_2" Column | Type | Modifiers --------+-----------------------+----------- id | integer | name | character varying(50) | not null Distributed by: (name)
如果用CTAS创建表时需要改变分布键,加上distributed by子句即可(create table ... (like ...) 同理)。
testDB=# create table t_hash_3 as select * from t_hash distributed by (id); SELECT 0 testDB=# testDB=# \d t_hash_3 Table "public.t_hash_3" Column | Type | Modifiers --------+-----------------------+----------- id | integer | name | character varying(50) | Distributed by: (id) testDB=# testDB=# testDB=# create table t_hash_4 (like t_hash) distributed by (id); CREATE TABLE testDB=# testDB=# \d t_hash4 Did not find any relation named "t_hash4". testDB=# \d t_hash_4 Table "public.t_hash_4" Column | Type | Modifiers --------+-----------------------+----------- id | integer | name | character varying(50) | not null Distributed by: (id)
对随机分布的表做CTAS时要特别注意,一定要加上distributed randomly,不然原表是随机分布,CTAS新表则会变成hash分布(例如下面的t_random_1,分布键变成了id)。
testDB=# \d t_random Table "public.t_random" Column | Type | Modifiers --------+------------------------+----------- id | integer | name | character varying(100) | Distributed randomly testDB=# testDB=# \d t_random_1 Table "public.t_random_1" Column | Type | Modifiers --------+------------------------+----------- id | integer | name | character varying(100) | Distributed by: (id)
testDB=# create table t_random_2 as select * from t_random distributed randomly; SELECT 7 testDB=# testDB=# \d t_random_2 Table "public.t_random_2" Column | Type | Modifiers --------+------------------------+----------- id | integer | name | character varying(100) | Distributed randomly