API Latest Releases: Java Core, C++ Core, Python, Memory, Pig, Hive,

Frequent Items Sketch Hive UDFs

add jar datasketches-memory-2.0.0.jar;
add jar datasketches-java-3.1.0.jar;
add jar datasketches-hive-1.2.0.jar;

create temporary function data2sketch as 'org.apache.datasketches.hive.frequencies.DataToStringsSketchUDAF';
create temporary function union as 'org.apache.datasketches.hive.frequencies.UnionStringsSketchUDAF';
create temporary function get_items as 'org.apache.datasketches.hive.frequencies.GetFrequentItemsFromStringsSketchUDTF';

use <your-db-name-here>;

create temporary table frequent_items_input (value string, category char(2));
-- works with char or varchar types as well
--create temporary table frequent_items_input (value char(2), category char(2));
--create temporary table frequent_items_input (value varchar(2), category char(2));
insert into table frequent_items_input values
  ('a', 'c1'), ('a', 'c1'), ('a', 'c1'), ('a', 'c2'), ('a', 'c1'), ('b', 'c1'), ('c', 'c1'), ('d', 'c1'), ('e', 'c2'), ('a', 'c1'),
  ('a', 'c2'), ('a', 'c2'), ('a', 'c2'), ('d', 'c1'), ('d', 'c2'), ('a', 'c1'), ('a', 'c2'), ('a', 'c1'), ('d', 'c2');

create temporary table frequent_items_intermediate (category char(2), sketch binary);
insert into frequent_items_intermediate select category, data2sketch(value, 16) from frequent_items_input group by category;

select get_items(sketch, 'NO_FALSE_POSITIVES') from frequent_items_intermediate where category='c1';

Output (item, frequency estimate, lower bound, upper bound):
a	7	7	7
d	2	2	2
c	1	1	1
b	1	1	1

select get_items(sketch, 'NO_FALSE_POSITIVES') from frequent_items_intermediate where category='c2';

a	5	5	5
d	2	2	2
e	1	1	1

-- NO_FALSE_POSITIVES is the default and was specified above just for completeness
select get_items(union(sketch)) from frequent_items_intermediate;

a	12	12	12
d	4	4	4
c	1	1	1
b	1	1	1
e	1	1	1