2014-02-15 16:15:16

Notes on Impala Partitioning

1. Example of creating a partitioned table:

(1) Create a partitioned table

create table logs (field1 string, field2 string, field3 string)
partitioned by (year string, month string , day string, host string)
row format delimited fields terminated by ',';
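
Note that the partition key columns (year, month, day, host) are declared only in the PARTITIONED BY clause, yet Impala treats them as part of the table schema; DESCRIBE shows them appended after the regular columns. A quick check (sketch; the impalad host is taken from the transcript later in these notes):

# year, month, day, host should appear after field1..field3:
impala-shell -i dcnode6:21000 -q "describe logs"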

(2) Insert data into the new partitions

insert into logs partition(year="2013", month="07",day="28",host="host1") values
("foo","foo","foo");
insert into logs partition(year="2013", month="07",day="28",host="host2") values
("foo","foo","foo");
insert into logs partition(year="2013", month="07",day="29",host="host1") values
("foo","foo","foo");
insert into logs partition(year="2013", month="07",day="29",host="host2") values
("foo","foo","foo");
insert into logs partition(year="2013", month="08",day="01",host="host1") values

(3) Inspect the corresponding HDFS directories:

[root@dcnode6 ~]# hadoop dfs -ls /user/hive/warehouse/gj.db/logs
DEPRECATED: Use of this script to execute hdfs command is deprecated.
Instead use the hdfs command for it.
Found 1 items
drwxr-xr-x   - impala supergroup          0 2014-02-25 15:16 /user/hive/warehouse/gj.db/logs/year=2013
[root@dcnode6 ~]# hadoop dfs -ls /user/hive/warehouse/gj.db/logs/year=2013
DEPRECATED: Use of this script to execute hdfs command is deprecated.
Instead use the hdfs command for it.
Found 2 items
drwxr-xr-x   - impala supergroup          0 2014-02-25 15:16 /user/hive/warehouse/gj.db/logs/year=2013/month=07
drwxr-xr-x   - impala supergroup          0 2014-02-25 15:16 /user/hive/warehouse/gj.db/logs/year=2013/month=08

(4) For an external table, create the partition directories yourself; this is different from the managed-table flow above:

hdfs dfs -mkdir -p /user/root/data/logs/year=2013/month=07/day=28/host=host1
hdfs dfs -mkdir -p /user/root/data/logs/year=2013/month=07/day=28/host=host2
hdfs dfs -mkdir -p /user/root/data/logs/year=2013/month=07/day=29/host=host1
hdfs dfs -mkdir -p /user/root/data/logs/year=2013/month=08/day=01/host=host1

(5) Upload the data files into the corresponding partition directories

$ cat dummy_log_data
bar,baz,bletch
hdfs dfs -put dummy_log_data /user/root/data/logs/year=2013/month=07/day=28/host=host1
hdfs dfs -put dummy_log_data /user/root/data/logs/year=2013/month=07/day=28/host=host2
hdfs dfs -put dummy_log_data /user/root/data/logs/year=2013/month=07/day=29/host=host1
hdfs dfs -put dummy_log_data /user/root/data/logs/year=2013/month=08/day=01/host=host1

(6) Create a new external table pointing at the directories just created

alter table logs rename to logs_original;
create external table logs (field1 string, field2 string, field3 string)
partitioned by (year string, month string, day string, host string)
row format delimited fields terminated by ','
location '/user/root/data/logs';
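
At this point the table knows its root location but none of its partitions, so queries return no rows until step (7) registers them. A quick sanity check (sketch, reusing the impalad host from the transcript):

# Expect 0 before the ADD PARTITION statements in step (7):
impala-shell -i dcnode6:21000 -q "select count(*) from logs"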

(7) Add the partitions so Impala recognizes the partition directories

Because partition subdirectories and data files come and go during the data lifecycle, you must identify each of the partitions through an ALTER TABLE statement before Impala recognizes the data files they contain.

alter table logs add partition (year="2013",month="07",day="28",host="host1");
alter table logs add partition (year="2013",month="07",day="28",host="host2");
alter table logs add partition (year="2013",month="07",day="29",host="host1");
alter table logs add partition (year="2013",month="08",day="01",host="host1");

(8) Run a query:

[dcnode6:21000] >  select * from logs;
Query: select * from logs
Query finished, fetching results ...
+--------+--------+--------+------+-------+-----+-------+
| field1 | field2 | field3 | year | month | day | host  |
+--------+--------+--------+------+-------+-----+-------+
| bar    | baz    | bletch | 2013 | 08    | 01  | host1 |
| bar    | baz    | bletch | 2013 | 07    | 28  | host1 |
| bar    | baz    | bletch | 2013 | 07    | 29  | host1 |
| bar    | baz    | bletch | 2013 | 07    | 28  | host2 |
+--------+--------+--------+------+-------+-----+-------+
Returned 4 row(s) in 0.39s
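
Because year, month, day, and host are partition keys, a filter on them prunes whole directories instead of scanning them; only the matching partitions are read (sketch):

# Reads only the two day=28 partitions (host1 and host2); all others are skipped:
impala-shell -i dcnode6:21000 -q "select * from logs where year='2013' and day='28'"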

2. Hands-on: creating partitions for a table in Impala:

Partitioning requirement: partition by day and hour, based on the time field in the table.

(1) Create the partition directories for the external table

Note the naming: write 1, not 01. The dd and hh partition columns are declared as int below, and alter table ... add partition (hh=1) looks for a directory named hh=1; a zero-padded hh=01 would not match. (A loop equivalent is sketched after the commands below.)

hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=0
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=1
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=2
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=3
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=4
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=5
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=6
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=7
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=8
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=9
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=10
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=11
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=12
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=13
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=14
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=15
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=16
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=17
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=18
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=19
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=20
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=21
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=22
hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=23
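
The 24 mkdir commands above can be generated with a loop; seq produces unpadded numbers, which matches the naming rule noted in (1) (equivalent sketch):

# Same effect as the 24 commands above (hh=0 .. hh=23, no zero padding):
for h in $(seq 0 23); do
  hdfs dfs -mkdir -p /xtld/data/gj/zlh/uni/dd=20130618/hh=$h
done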

(2) Move the data files into the corresponding partition directories (a loop version is sketched after the commands):

hadoop dfs -mv /xtld/data/gj/zlh/2013061802*  /xtld/data/gj/zlh/uni/dd=20130618/hh=2
hadoop dfs -mv /xtld/data/gj/zlh/2013061803*  /xtld/data/gj/zlh/uni/dd=20130618/hh=3
hadoop dfs -mv /xtld/data/gj/zlh/2013061804*  /xtld/data/gj/zlh/uni/dd=20130618/hh=4
hadoop dfs -mv /xtld/data/gj/zlh/2013061805*  /xtld/data/gj/zlh/uni/dd=20130618/hh=5
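
The transcript only moves hours 02 through 05. A loop for all 24 hours has to bridge the two naming schemes, since the source file prefixes are zero-padded (2013061802) while the partition directories are not (hh=2). A sketch; hours with no matching files simply report an error that can be ignored:

# Move every hour's files; %02d pads the source prefix, $h stays unpadded.
for h in $(seq 0 23); do
  src=$(printf '/xtld/data/gj/zlh/20130618%02d*' "$h")
  hdfs dfs -mv "$src" /xtld/data/gj/zlh/uni/dd=20130618/hh=$h
done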

(3) Create the external table, specifying the partition root directory:

create external table uni
(
PHONE_CODE    string,
LAC           string,
CI            string,
IMEI          string,
FLOW_TYPE     string,
BTIME         TIMESTAMP,
ETIME         TIMESTAMP,
DURATION      bigint,
UP_FLOW       bigint,
DOWN_FLOW     bigint,
SUM_FLOW      bigint,
NET_TYPE      string,
SRC_IP        string,
DST_IP        string,
STATE_FLAG    string,
USER_AGENT    string,
APN           string,
IMSI          string,
SGSN_IP       string,
GGSN_IP       string,
CONTENT_TYPE  string,
SRC_PORT      bigint,
DST_PORT      bigint,
REC_FLAG      string,
REC_NUM       bigint,
URL_SIG       string
)
partitioned by (dd int, hh int)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
location '/xtld/data/gj/zlh/uni';

(4) Manually add the partitions so the table can see the files under each one (a scripted equivalent follows the statements)

alter table uni add partition (dd=20130618,hh=0);
alter table uni add partition (dd=20130618,hh=1);
alter table uni add partition (dd=20130618,hh=2);
alter table uni add partition (dd=20130618,hh=3);
alter table uni add partition (dd=20130618,hh=4);
alter table uni add partition (dd=20130618,hh=5);
alter table uni add partition (dd=20130618,hh=6);
alter table uni add partition (dd=20130618,hh=7);
alter table uni add partition (dd=20130618,hh=8);
alter table uni add partition (dd=20130618,hh=9);
alter table uni add partition (dd=20130618,hh=10);
alter table uni add partition (dd=20130618,hh=11);
alter table uni add partition (dd=20130618,hh=12);
alter table uni add partition (dd=20130618,hh=13);
alter table uni add partition (dd=20130618,hh=14);
alter table uni add partition (dd=20130618,hh=15);
alter table uni add partition (dd=20130618,hh=16);
alter table uni add partition (dd=20130618,hh=17);
alter table uni add partition (dd=20130618,hh=18);
alter table uni add partition (dd=20130618,hh=19);
alter table uni add partition (dd=20130618,hh=20);
alter table uni add partition (dd=20130618,hh=21);
alter table uni add partition (dd=20130618,hh=22);
alter table uni add partition (dd=20130618,hh=23);
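
The 24 ADD PARTITION statements can likewise be scripted through impala-shell rather than typed by hand (equivalent sketch, host/port from the transcript):

# Register hh=0 .. hh=23 for dd=20130618 in one loop:
for h in $(seq 0 23); do
  impala-shell -i dcnode6:21000 -q "alter table uni add partition (dd=20130618, hh=$h)"
done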

(5) Test:

[dcnode6:21000] > select count(*) from uni where PHONE_CODE='73910553410' and dd=20130618 and hh=02;
Query: select count(*) from uni where PHONE_CODE='73910553410' and dd=20130618 and hh=02
Query finished, fetching results ...
+----------+
| count(*) |
+----------+
| 1170     |
+----------+
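
The predicate hh=02 works because hh is an int column: the literal 02 is parsed as the integer 2, so it matches the hh=2 partition. To confirm that only one of the 24 partitions is actually scanned, EXPLAIN shows the pruning in the scan node (sketch; expect something like partitions=1/24 in the plan):

# The HDFS scan node in the plan should report partitions=1/24:
impala-shell -i dcnode6:21000 -q "explain select count(*) from uni where dd=20130618 and hh=2"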