-- 创建hive表 drop table test.dws_user_baseinfo; create external table test.dws_user_baseinfo ( account_id string, tag_name string, tag_value string ) partitioned by (dt string) row format delimited fields terminated by ','; -- 构造测试数据,tmp.txt内容如下 45394175,client_language,ar_SA 58241491,client_language,ar_SA -- 导入hive表 load data local inpath 'tmp.txt' overwrite into table test.dws_user_baseinfo partition(dt='2022-07-30'); -- 创建bitmap表 drop table test.dws_user_baseinfo_bitmap; create external table test.dws_user_baseinfo_bitmap ( tag_name string, tag_value string, account_bitmap binary ) partitioned by (dt string) stored as parquet tblproperties ('parquet.compression' = 'snappy'); -- 注册bitmap udf create function to_bitmap as 'com.**.bigdata.udf.ToBitmapUDAF' using jar 'hdfs:/user/hive/jars/hive-bitmap-udf.jar'; create function bitmap_union as 'com.**.bigdata.udf.BitmapUnionUDAF' using jar 'hdfs:/user/hive/jars/hive-bitmap-udf.jar'; create function bitmap_count as 'com.**.bigdata.udf.BitmapCountUDF' using jar 'hdfs:/user/hive/jars/hive-bitmap-udf.jar'; -- 用spark sql运行如下语句 insert overwrite table test.dws_user_baseinfo_bitmap partition(dt='2022-07-30') select tag_name, tag_value, to_bitmap(cast(account_id as bigint)) as account_bitmap from test.dws_user_baseinfo where dt = '2022-07-30' group by tag_name,tag_value; -- 在hive表可以查到结果应该是client_language ar_SA 2 select tag_name, tag_value, bitmap_count(account_bitmap) from test.dws_user_baseinfo_bitmap; -- 创建doris bitmap表 create table `test_bitmap` ( `dt` varchar(45) null comment "", `tag_name` varchar(45) null comment "", `tag_value` varchar(45) null comment "", `account_bitmap` bitmap BITMAP_UNION null comment "" ) ENGINE = OLAP AGGREGATE key(`dt`, `tag_name`, `tag_value`) comment "OLAP" DISTRIBUTED by HASH(`dt`) buckets 2 PROPERTIES('replication_num' = '1'); -- 创建hive外表 create external table `test_bitmap_hive` ( `tag_name` varchar(45), `tag_value` varchar(45), `account_bitmap` varchar(500), `dt` varchar(100) ) ENGINE = hive comment "HIVE" PROPERTIES ( 'hive.metastore.uris' = 'thrift://master-01:9083,thrift://master-02:9083', 'database' = 'test', 'table' = 'dws_user_baseinfo_bitmap' ); -- hive bitmap数据导入doris load label test ( data from table test_bitmap_hive into table test_bitmap set (account_bitmap = binary_bitmap(account_bitmap)) ) with resource 'spark' -- 在doris表查到的结果却是client_language ar_SA 1 select tag_name, tag_value, bitmap_count(account_bitmap) from test_bitmap -- 如果查询语句换成如下则结果是2,所以我想导入数据的时候是不是错误地把bitmap列count之后再存放进去,实际上应该放account的列表 select tag_name, tag_value, bitmap_to_string(account_bitmap) from test_bitmap