本文共 11431 字,大约阅读时间需要 38 分钟。
在Hive中,系统给我们内置了很多函数,具体可通过如下命令查看:
SHOW FUNCTIONS; --查看所有内置函数
DESCRIBE FUNCTION <函数名>; --查看某个函数的描述
DESCRIBE FUNCTION EXTENDED <函数名>; --查看某个函数的具体使用方法
hive> DESCRIBE FUNCTION case;OKCASE a WHEN b THEN c [WHEN d THEN e]* [ELSE f] END - When a = b, returns c; when a = d, return e; else return fTime taken: 0.006 seconds, Fetched: 1 row(s)hive> DESCRIBE FUNCTION EXTENDED case;OKCASE a WHEN b THEN c [WHEN d THEN e]* [ELSE f] END - When a = b, returns c; when a = d, return e; else return fExample: SELECT CASE deptno WHEN 1 THEN Engineering WHEN 2 THEN Finance ELSE admin END, CASE zone WHEN 7 THEN Americas ELSE Asia-Pac END FROM emp_detailsTime taken: 0.008 seconds, Fetched: 13 row(s)# DESCRIBE 可简写为deschive> desc FUNCTION EXTENDED case;OKCASE a WHEN b THEN c [WHEN d THEN e]* [ELSE f] END - When a = b, returns c; when a = d, return e; else return fExample: SELECT CASE deptno WHEN 1 THEN Engineering WHEN 2 THEN Finance ELSE admin END, CASE zone WHEN 7 THEN Americas ELSE Asia-Pac END FROM emp_detailsTime taken: 0.009 seconds, Fetched: 13 row(s)
下面我们了解下常用函数的使用方法:
# 为了方便测试 我们创建常用的dual表hive> create table dual(x string);OKTime taken: 0.11 secondshive> insert into table dual values('');Query ID = hadoop_20180702100505_f0566585-06b2-4c53-910a-b6a58791fc2dTotal jobs = 3Launching Job 1 out of 3...OKTime taken: 29.535 secondshive> select * from dual;OKTime taken: 0.147 seconds, Fetched: 1 row(s)# 测试当前日期hive> select current_date from dual;OK2018-07-02Time taken: 0.111 seconds, Fetched: 1 row(s)# 测试当前时间戳hive> select current_timestamp from dual;OK2018-07-02 15:03:28.919Time taken: 0.117 seconds, Fetched: 1 row(s)# 测试substr函数 用于截取字符串hive> desc function extended substr;OKsubstr(str, pos[, len]) - returns the substring of str that starts at pos and is of length len orsubstr(bin, pos[, len]) - returns the slice of byte array that starts at pos and is of length lenSynonyms: substringpos is a 1-based index. If pos<0 the starting position is determined by counting backwards from the end of str.Example: > SELECT substr('Facebook', 5) FROM src LIMIT 1; 'book' > SELECT substr('Facebook', -5) FROM src LIMIT 1; 'ebook' > SELECT substr('Facebook', 5, 1) FROM src LIMIT 1; 'b'Time taken: 0.016 seconds, Fetched: 10 row(s)hive> SELECT substr('helloworld',-5) FROM dual;OKworldTime taken: 0.171 seconds, Fetched: 1 row(s)hive> SELECT substr('helloworld',5) FROM dual;OKoworldTime taken: 0.12 seconds, Fetched: 1 row(s)hive> SELECT substr('helloworld',5,3) FROM dual;OKowoTime taken: 0.142 seconds, Fetched: 1 row(s)# 测试函数concat_ws 用于将字符串按指定分隔符连接起来hive> desc function extended concat_ws;OKconcat_ws(separator, [string | array(string)]+) - returns the concatenation of the strings separated by the separator.Example: > SELECT concat_ws('.', 'www', array('facebook', 'com')) FROM src LIMIT 1; 'www.facebook.com'Time taken: 0.019 seconds, Fetched: 4 row(s)hive> select concat_ws(".","192","168","199","151") from dual;OK192.168.199.151Time taken: 0.152 seconds, Fetched: 1 row(s)# 测试函数split 用于拆分hive> desc function extended split;OKsplit(str, regex) - Splits str around occurances 
that match regexExample: > SELECT split('oneAtwoBthreeC', '[ABC]') FROM src LIMIT 1; ["one", "two", "three"]Time taken: 0.021 seconds, Fetched: 4 row(s)hive> select split("192.168.199.151","\\.") from dual;OK["192","168","199","151"]Time taken: 0.169 seconds, Fetched: 1 row(s)
# 简单select语法hive> select * from emp where deptno=10;OK7782 CLARK MANAGER 7839 1981-6-9 2450.0 NULL 107839 KING PRESIDENT NULL 1981-11-17 5000.0 NULL 107934 MILLER CLERK 7782 1982-1-23 1300.0 NULL 10Time taken: 0.899 seconds, Fetched: 3 row(s)hive> select * from emp where empno <= 7800;OK7369 SMITH CLERK 7902 1980-12-17 800.0 NULL 207499 ALLEN SALESMAN 7698 1981-2-20 1600.0 300.0 307521 WARD SALESMAN 7698 1981-2-22 1250.0 500.0 307566 JONES MANAGER 7839 1981-4-2 2975.0 NULL 207654 MARTIN SALESMAN 7698 1981-9-28 1250.0 1400.0 307698 BLAKE MANAGER 7839 1981-5-1 2850.0 NULL 307782 CLARK MANAGER 7839 1981-6-9 2450.0 NULL 107788 SCOTT ANALYST 7566 1987-4-19 3000.0 NULL 20Time taken: 0.277 seconds, Fetched: 8 row(s)hive> select * from emp where salary between 1000 and 1500;OK7521 WARD SALESMAN 7698 1981-2-22 1250.0 500.0 307654 MARTIN SALESMAN 7698 1981-9-28 1250.0 1400.0 307844 TURNER SALESMAN 7698 1981-9-8 1500.0 0.0 307876 ADAMS CLERK 7788 1987-5-23 1100.0 NULL 207934 MILLER CLERK 7782 1982-1-23 1300.0 NULL 10Time taken: 0.187 seconds, Fetched: 5 row(s)hive> select * from emp limit 5;OK7369 SMITH CLERK 7902 1980-12-17 800.0 NULL 207499 ALLEN SALESMAN 7698 1981-2-20 1600.0 300.0 307521 WARD SALESMAN 7698 1981-2-22 1250.0 500.0 307566 JONES MANAGER 7839 1981-4-2 2975.0 NULL 207654 MARTIN SALESMAN 7698 1981-9-28 1250.0 1400.0 30Time taken: 0.154 seconds, Fetched: 5 row(s)hive> select * from emp where empno in(7566,7499);OK7499 ALLEN SALESMAN 7698 1981-2-20 1600.0 300.0 307566 JONES MANAGER 7839 1981-4-2 2975.0 NULL 20Time taken: 0.153 seconds, Fetched: 2 row(s)hive> select * from emp where comm is not null;OK7499 ALLEN SALESMAN 7698 1981-2-20 1600.0 300.0 307521 WARD SALESMAN 7698 1981-2-22 1250.0 500.0 307654 MARTIN SALESMAN 7698 1981-9-28 1250.0 1400.0 307844 TURNER SALESMAN 7698 1981-9-8 1500.0 0.0 30Time taken: 0.291 seconds, Fetched: 4 row(s)
# 聚合函数及分组函数# max/min/count/sum/avg 特点:多进一出,进来很多条记录出去只有一条记录# 查询部门编号为10的有多少条记录hive> select count(1) from emp where deptno=10;Query ID = hadoop_20180703142525_af460dc7-287b-41b2-8af3-ba27cc0ea6ceTotal jobs = 1...OK3Time taken: 38.951 seconds, Fetched: 1 row(s)# 求最大工资,最小工资,平均工资,工资的和hive> select max(salary),min(salary),avg(salary),sum(salary) from emp;Query ID = hadoop_20180703142525_af460dc7-287b-41b2-8af3-ba27cc0ea6ceTotal jobs = 1...OK5000.0 800.0 2073.214285714286 29025.0Time taken: 23.748 seconds, Fetched: 1 row(s)# 分组函数 group by# 求部门的平均工资# 注:select中出现的字段,如果没有出现在组函数/聚合函数中,必须出现在group by里面,否则就会产生报错hive> select deptno,avg(salary) from emp group by deptno;Query ID = hadoop_20180703142525_af460dc7-287b-41b2-8af3-ba27cc0ea6ceTotal jobs = 1...OK10 2916.666666666666520 2175.030 1566.6666666666667Time taken: 36.502 seconds, Fetched: 3 row(s)# 求每个部门(deptno)、工作岗位(job)的最高工资(salary)hive> select deptno,job,max(salary) from emp group by deptno,job;Query ID = hadoop_20180703142525_af460dc7-287b-41b2-8af3-ba27cc0ea6ceTotal jobs = 1...OK10 CLERK 1300.010 MANAGER 2450.010 PRESIDENT 5000.020 ANALYST 3000.020 CLERK 1100.020 MANAGER 2975.030 CLERK 950.030 MANAGER 2850.030 SALESMAN 1600.0Time taken: 36.096 seconds, Fetched: 9 row(s)# 查询平均工资大于2000的部门(使用having子句限定分组查询)hive> select deptno,avg(salary) from emp group by deptno having avg(salary) >2000;Query ID = hadoop_20180703142525_af460dc7-287b-41b2-8af3-ba27cc0ea6ceTotal jobs = 1...OK10 2916.666666666666520 2175.0Time taken: 24.71 seconds, Fetched: 2 row(s)# case when then end(不会跑mr)hive> select ename, salary, > case > when salary > 1 and salary <= 1000 then 'LOWER' > when salary > 1000 and salary <= 2000 then 'MIDDLE' > when salary > 2000 and salary <= 4000 then 'HIGH' > ELSE 'HIGHEST' > end > from emp;OKSMITH 800.0 LOWERALLEN 1600.0 MIDDLEWARD 1250.0 MIDDLEJONES 2975.0 HIGHMARTIN 1250.0 MIDDLEBLAKE 2850.0 HIGHCLARK 2450.0 HIGHSCOTT 3000.0 HIGHKING 5000.0 HIGHESTTURNER 1500.0 MIDDLEADAMS 1100.0 MIDDLEJAMES 950.0 LOWERFORD 3000.0 
HIGHMILLER 1300.0 MIDDLETime taken: 0.096 seconds, Fetched: 14 row(s)
# 创建测试表hive> create table a( > id int, name string > ) row format delimited fields terminated by '\t';OKTime taken: 0.311 secondshive> create table b( > id int, age int > ) row format delimited fields terminated by '\t';OKTime taken: 0.142 seconds# insert或load数据 最后表数据如下hive> select * from a;OK1 zhangsan2 lisi3 wangwuhive> select * from b;OK1 202 304 40Time taken: 0.2 seconds, Fetched: 3 row(s)# 内连接 inner join = join 仅列出表1和表2符合连接条件的数据hive> select a.id,a.name,b.age from a join b on a.id=b.id;Query ID = hadoop_20180703142525_af460dc7-287b-41b2-8af3-ba27cc0ea6ceTotal jobs = 1...OK1 zhangsan 202 lisi 30Time taken: 24.415 seconds, Fetched: 2 row(s)# 左外连接(left join) 以左边的为基准,左边的数据全部出现,如果没有对应的数据就赋null值hive> select a.id,a.name,b.age from a left join b on a.id=b.id;Query ID = hadoop_20180703142525_af460dc7-287b-41b2-8af3-ba27cc0ea6ceTotal jobs = 1...OK1 zhangsan 202 lisi 303 wangwu NULLTime taken: 26.218 seconds, Fetched: 3 row(s)# 右外连接(right join) 以右表为基准hive> select a.id,a.name,b.age from a right join b on a.id=b.id;Query ID = hadoop_20180703142525_af460dc7-287b-41b2-8af3-ba27cc0ea6ceTotal jobs = 1...OK1 zhangsan 202 lisi 30NULL NULL 40Time taken: 24.027 seconds, Fetched: 3 row(s)# 全连接(full join)相当于表1和表2的数据都显示,如果没有对应的数据,则显示Null.hive> select a.id,a.name,b.age from a full join b on a.id=b.id;Query ID = hadoop_20180703142525_af460dc7-287b-41b2-8af3-ba27cc0ea6ceTotal jobs = 1...OK1 zhangsan 202 lisi 303 wangwu NULLNULL NULL 40Time taken: 32.94 seconds, Fetched: 4 row(s)# 笛卡尔积(cross join) 没有连接条件 会针对表1和表2的每条数据做连接hive> select a.id,a.name,b.age from a cross join b;Warning: Map Join MAPJOIN[7][bigTable=a] in task 'Stage-3:MAPRED' is a cross productQuery ID = hadoop_20180703142525_af460dc7-287b-41b2-8af3-ba27cc0ea6ceTotal jobs = 1...OK1 zhangsan 201 zhangsan 301 zhangsan 402 lisi 202 lisi 302 lisi 403 wangwu 203 wangwu 303 wangwu 40Time taken: 29.825 seconds, Fetched: 9 row(s)
# 创建表 加载测试数据hive> create table hive_wc(sentence string);OKTime taken: 0.149 seconds[hadoop@hadoop000 ~]$ cat hive-wc.txthello,world,welcomehello,welcomehive> load data local inpath '/home/hadoop/hive-wc.txt' into table hive_wc;Loading data to table default.hive_wcTable default.hive_wc stats: [numFiles=1, totalSize=34]OKTime taken: 0.729 secondshive> select * from hive_wc;OKhello,world,welcomehello,welcomeTime taken: 0.13 seconds, Fetched: 2 row(s)# 获取每个单词 利用split分割hive> select split(sentence,",") from hive_wc;OK["hello","world","welcome"]["hello","welcome"]Time taken: 0.163 seconds, Fetched: 2 row(s)# explode把数组转成多行 结合split使用如下hive> select explode(split(sentence,",")) from hive_wc;OKhelloworldwelcomehellowelcomeTime taken: 0.068 seconds, Fetched: 5 row(s)# 做group by操作 一条语句即可实现wordcount统计hive> select word, count(1) as c > from (select explode(split(sentence,",")) as word from hive_wc) t > group by word ;Query ID = hadoop_20180703142525_af460dc7-287b-41b2-8af3-ba27cc0ea6ceTotal jobs = 1...OKhello 2welcome 2world 1Time taken: 34.168 seconds, Fetched: 3 row(s)
转载于:https://blog.51cto.com/10814168/2135601