淘先锋技术网

首页 1 2 3 4 5 6 7
-- 创建hive表
drop table test.dws_user_baseinfo;
create external table test.dws_user_baseinfo
(
    account_id string,
    tag_name   string,
    tag_value  string
)
    partitioned by (dt string)
    row format delimited fields terminated by ',';

-- 构造测试数据,tmp.txt内容如下
45394175,client_language,ar_SA
58241491,client_language,ar_SA

-- 导入hive表
load data local inpath 'tmp.txt' overwrite into table test.dws_user_baseinfo partition(dt='2022-07-30');

-- 创建bitmap表
drop table test.dws_user_baseinfo_bitmap;
create external table test.dws_user_baseinfo_bitmap
(
    tag_name       string,
    tag_value      string,
    account_bitmap binary
)
    partitioned by (dt string)
    stored as parquet
    tblproperties ('parquet.compression' = 'snappy');

-- 注册bitmap udf
create function to_bitmap as 'com.**.bigdata.udf.ToBitmapUDAF' using jar 'hdfs:/user/hive/jars/hive-bitmap-udf.jar';
create function bitmap_union as 'com.**.bigdata.udf.BitmapUnionUDAF' using jar 'hdfs:/user/hive/jars/hive-bitmap-udf.jar';
create function bitmap_count as 'com.**.bigdata.udf.BitmapCountUDF' using jar 'hdfs:/user/hive/jars/hive-bitmap-udf.jar';


-- 用spark sql运行如下语句
insert overwrite table test.dws_user_baseinfo_bitmap partition(dt='2022-07-30')
select
    tag_name,
    tag_value,
    to_bitmap(cast(account_id as bigint)) as account_bitmap
from test.dws_user_baseinfo
where dt = '2022-07-30'
group by tag_name,tag_value;

-- 在hive表可以查到结果应该是client_language ar_SA   2
select
    tag_name,
    tag_value,
    bitmap_count(account_bitmap)
from test.dws_user_baseinfo_bitmap;

-- 创建doris bitmap表
create table `test_bitmap`
(
    `dt`        varchar(45) null comment "",
    `tag_name`  varchar(45) null comment "",
    `tag_value` varchar(45) null comment "",
    `account_bitmap` bitmap BITMAP_UNION null comment ""
) ENGINE = OLAP
    AGGREGATE key(`dt`, `tag_name`, `tag_value`) comment "OLAP"
    DISTRIBUTED by HASH(`dt`) buckets 2 PROPERTIES('replication_num' = '1');

-- 创建hive外表
create external table `test_bitmap_hive`
(
    `tag_name`       varchar(45),
    `tag_value`      varchar(45),
    `account_bitmap` varchar(500),
    `dt`             varchar(100)
) ENGINE = hive comment "HIVE" PROPERTIES (
  'hive.metastore.uris' = 'thrift://master-01:9083,thrift://master-02:9083',
  'database' = 'test',
  'table' = 'dws_user_baseinfo_bitmap'
);

-- hive bitmap数据导入doris
load label test (
    data
    from
    table test_bitmap_hive into table test_bitmap
    set
    (account_bitmap = binary_bitmap(account_bitmap))
    ) with resource 'spark'

-- 在doris表查到的结果却是client_language ar_SA   1
select
    tag_name,
    tag_value,
    bitmap_count(account_bitmap)
from test_bitmap

-- 如果查询语句换成如下则结果是2,所以我想导入数据的时候是不是错误地把bitmap列count之后再存放进去,实际上应该放account的列表
select
    tag_name,
    tag_value,
    bitmap_to_string(account_bitmap)
from test_bitmap