ICode9

精准搜索请尝试: 精确搜索
首页 > 其他分享> 文章详细

hive数仓构建

2019-09-01 15:08:59  阅读:266  来源: 互联网

标签:数仓 ol string dim hive 构建 browser id name


原文链接:https://blog.csdn.net/myloveprogrmming/article/details/85813114

转载自:https://blog.csdn.net/myloveprogrmming/article/details/85813114
1、获取数据源
后台的服务数据
前台的点击流日志数据
业务数据

2、确定主题
用户主题:用户、会员相关的信息
订单主题:订单相关的信息
浏览器主题:跟浏览器相关的信息
事件主题:跟事件相关

3、创建模型(创建表)
100张表以内不分层,以外将分层。
如果维度表较多则将维度表单独分层

1、创建维度层:
dim_维度表
地域维度:拆分成 省 市
浏览器维度:浏览器名称、版本
时间维度:周、季度
事件维度:category(种类)、action
平台维度:
kpi:(指标)

创建ods库:注:ods=operate database store
创建dw库:dw=data warehouse 数据仓库
创建dm库: dm=date manipulation数据操作

命名规则:
库名_主题_表名
ods_user_addr_
ODS_USER_ADDR_(大小写均可,但最好小写)
ods.user.addr不行
事实表
create database if not exists dim;//纬度
create database if not exists ods;//操作数据库存储
create database if not exists dw;//数据仓库
create database if not exists dm;//数据操作

在维度库下面创建维度表:注:字段带有特殊符号用这个解决,这里desc是关键字,所以用
CREATE TABLE IF NOT EXISTS dim_province (
id int,
province string,
country_id int,
desc string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_city (
id int,
city string,
desc string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_province_city (
dim_region_id bigint,
dim_region_city_name string,
dim_region_province_name string,
dim_region_country_name string,
dim_region_city_id string,
dim_region_province_id string,
dim_region_country_id string,
dim_region_date string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_platform (
id int,
platform_name string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_kpi (
id int,
kpi_name string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_event_name (
id int,
name string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_event_category (
id int,
category string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_event_action (
id int,
action string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_browser_name (
id int,
browser_name string,
browser_version_id int
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_browser_version (
id int,
browser_version string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_order (
oid bigint,
on string,
cut_id bigint,
cua_id bigint,
browser_version string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_currency_type (
id int,
currency_name string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_payment_type (
id int,
payment_type string
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_week(
STD_WEEK_CODE STRING,
STD_WEEK_NAME STRING,
BEGIN_DATE STRING,
END_DATE STRING,
NOTES STRING,
IS_DISPLAY INT,
DISPLAY_ORDER INT,
IS_VALID INT,
UPDATE_DATE STRING,
LAST_STD_WEEK_CODE STRING
)
row format delimited fields terminated by ‘\t’
;

CREATE TABLE IF NOT EXISTS dim_userinfo (
uid String,
uname string
)
row format delimited fields terminated by ‘\t’
;

load data local inpath ‘/root/dim/dim_province’ into table dim_province;
load data local inpath ‘/root/dim/dim_city’ into table dim_city;
load data local inpath ‘/root/dim/dim_province_city’ into table dim_province_city;
load data local inpath ‘/root/dim/dim_platform’ into table dim_platform;
load data local inpath ‘/root/dim/dim_kpi’ into table dim_kpi;
load data local inpath ‘/root/dim/dim_event_name’ into table dim_event_name;
load data local inpath ‘/root/dim/dim_browser_name’ into table dim_browser_name;
load data local inpath ‘/root/dim/dim_browser_version’ into table dim_browser_version;
load data local inpath ‘/root/dim/dim_userinfo’ into table dim_userinfo;


在ods层创建数据表:这里用到对元数据进行处理(UDF)
create table if not exists ods_font_log(
ip string,
ts string,
server_ip string,
url string
)
row format delimited fields terminated by ‘\u0001’
stored as textfile
;

create table if not exists ods_end_log(
ip string,
ts string,
server_ip string,
url string
)
row format delimited fields terminated by ‘\u0001’
stored as textfile
;

create table if not exists ods_logs(
ip string,
ts string,
server_ip string,
url string
)
row format delimited fields terminated by ‘\t’
stored as orc
;

insert into ods_logs
select
from ods_font_log
union all
select
from ods_end_log
;

udf:
ip
url解析
或者用mr清洗数据:

create table if not exists ods_logs(
ver string,
s_time string,
en string,
u_ud string,
u_mid string,
u_sd string,
c_time string,
l string,
b_iev string,
b_rst string,
p_url string,
p_ref string,
tt string,
pl string,
ip string,
oid string,
on string,
cua string,
cut string,
pt string,
ca string,
ac string,
kv_ string,
du string,
browser_name string,
browser_version string,
os_name string,
os_version string,
country string,
province string,
city string
)
row format delimited fields terminated by ‘\u0001’
stored as textfile
;
注:
删除内部表会直接删除元数据(metadata)及存储数据;
删除外部表仅仅会删除元数据,HDFS上的文件并不会被删除;

create external table if not exists ods_logs_orc(
ver string,
s_time string,
en string,
u_ud string,
u_mid string,
u_sd string,
c_time string,
l string,
b_iev string,
b_rst string,
p_url string,
p_ref string,
tt string,
pl string,
ip string,
oid string,
on string,
cua string,
cut string,
pt string,
ca string,
ac string,
kv_ string,
du string,
browser_name string,
browser_version string,
os_name string,
os_version string,
country string,
province string,
city string
)
partitioned by (month string,day string)
row format delimited fields terminated by ‘\u0001’
stored as orc
;

导入原始数据:
#load data inpath ‘/ods/11/09/’ into table ods_logs;
我因为没跑mr程序所以没有这个数据,让同学给了一份:/root/dim/part-m-00000
load data local inpath ‘/root/dim/part-m-00000’ into table ods_logs;

将数据导入到orc表中

from ods_logs
insert into ods_logs_orc(
ver ,
s_time ,
en ,
u_ud ,
u_mid ,
u_sd ,
c_time ,
l ,
b_iev ,
b_rst ,
p_url ,
p_ref ,
tt ,
pl ,
ip ,
oid ,
on ,
cua ,
cut ,
pt ,
ca ,
ac ,
kv_ ,
du ,
browser_name ,
browser_version ,
os_name ,
os_version ,
country ,
province ,
city
) partition (month=1,day=‘01’)
select
ver ,
s_time ,
en ,
u_ud ,
u_mid ,
u_sd ,
c_time ,
l ,
b_iev ,
b_rst ,
p_url ,
p_ref ,
tt ,
pl ,
ip ,
oid ,
on ,
cua ,
cut ,
pt ,
ca ,
ac ,
kv_ ,
du ,
browser_name ,
browser_version ,
os_name ,
os_version ,
country ,
province ,
city
;

上面的报错,可以直接简写为:
from ods_logs
insert into ods_logs_orc partition(month=1,day=1)
select *
;

创建dw层的数据
##为新增用户、总用户、活跃用户做计算:
create table if not exists dwd_user(
pl string,
pl_id string,
en string,
en_id string,
browser_name string,
browser_id string,
browser_version string,
browser_version_id string,
province_name string,
province_id string,
city_name string,
city_id string,
uid string
)
partitioned by (month string,day string)
row format delimited fields terminated by ‘\u0001’
stored as orc
;

导入数据:
from (
select
ol.pl,
dp.id as pl_id,
ol.en,
de.id as en_id,
ol.browser_name,
db.id browser_id,
ol.browser_version ,
dv.id browser_version_id,
ol.province ,
pv.id province_id,
ol.city ,
dc.id city_id,
ol.u_ud
from ods.ods_logs_orc ol
left join dim.dim_platform dp on dp.platform_name = ol.pl
left join dim.dim_event_name de on de.name = ol.en
left join dim.dim_browser_name db on db.browser_name = ol.browser_name
left join dim.dim_browser_version dv on dv.browser_version = ol.browser_version
left join dim.dim_province pv on pv.province = ol.province
left join dim.dim_city dc on dc.city = ol.city
) tmp
insert into dwd_user partition (month=‘1’,day = ‘01’)
select *
;


另一种写法:
with tmp as (
select
ol.pl,
dp.id as pl_id,
ol.en,
de.id as en_id,
ol.browser_name,
db.id browser_id,
ol.browser_version ,
dv.id browser_version_id,
ol.province_name ,
pv.id province_id,
ol.city_name ,
dc.id city_id,
ol.uid
from ods_logs_orc ol
left join dim_platform dp on dp.platform_name = ol.pl
left join dim_event_name de on de.name = ol.en
left join dim_browser_name db on db.browser_name = ol.browser_Name
left join dim_browser_version dv on dv.browser_version = ol.browser_version
left join dim_province pv on pv.province = ol.province_name
left join dim_city dc on dc.city = ol.city_name
)
insert into dwd_user partition (month=‘1’,day = ‘01’)
select * from tmp
;

————————————————
版权声明:本文为CSDN博主「大岛君」的原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/myloveprogrmming/article/details/85813114

标签:数仓,ol,string,dim,hive,构建,browser,id,name
来源: https://blog.csdn.net/weixin_42205780/article/details/100183216

本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享;
2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关;
3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关;
4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除;
5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

专注分享技术,共同学习,共同进步。侵权联系[81616952@qq.com]

Copyright (C)ICode9.com, All Rights Reserved.

ICode9版权所有