citus 系列7 - topn 加速(count(*) group by order by cou
背景
postgres=# \df topn* List of functions Schema | Name | Result data type | Argument data * | Type --------+------------------+-------------------+---------------------+-------- public | topn | SETOF topn_record | jsonb, integer | normal public | topn_add | jsonb | jsonb, text | normal public | topn_add_agg | jsonb | text | agg public | topn_add_trans | internal | internal, text | normal public | topn_pack | jsonb | internal | normal public | topn_union | jsonb | jsonb, jsonb | normal public | topn_union_agg | jsonb | jsonb | agg public | topn_union_trans | internal | internal, jsonb | normal (8 rows)
-- starting from nothing, record that we saw an "a"
select topn_add('{}', 'a');
-- => {"a": 1}
-- record the sighting of another "a"
select topn_add(topn_add('{}', 'a'), 'a');
-- => {"a": 2}
-- for normal_rand
create extension tablefunc;
-- count values from a normal distribution
SELECT topn_add_agg(floor(abs(i))::text)
FROM normal_rand(1000, 5, 0.7) i;
-- => {"2": 1, "3": 74, "4": 420, "5": 425, "6": 77, "7": 3}
postgres=# select (topn(topn_union_agg(agg_prodid),5)).* from reviews_by_prodid; item | frequency --------+----------- 509594 | 66 497599 | 59 505217 | 58 461257 | 58 403111 | 57 (5 rows)
使用topn
cd ~ . /var/lib/pgsql/.bash_profile git clone https://github.com/citusdata/postgresql-topn cd postgresql-topn USE_PGXS=1 make USE_PGXS=1 make install
postgres=# create extension topn; CREATE EXTENSION
postgres=# select run_command_on_workers('create extension topn;');
run_command_on_workers
--------------------------------------------
(xxx.xxx.xxx.224,1921,t,"CREATE EXTENSION")
(xxx.xxx.xxx.225,1921,t,"CREATE EXTENSION")
(xxx.xxx.xxx.226,1921,t,"CREATE EXTENSION")
(xxx.xxx.xxx.227,1921,t,"CREATE EXTENSION")
(xxx.xxx.xxx.229,1921,t,"CREATE EXTENSION")
(xxx.xxx.xxx.230,1921,t,"CREATE EXTENSION")
(xxx.xxx.xxx.231,1921,t,"CREATE EXTENSION")
(xxx.xxx.xxx.232,1921,t,"CREATE EXTENSION")
(8 rows)
测试
create table tbl(id serial8,gid int, prodid int, c1 int, c2 int);
postgres=# \d tbl
Table "public.tbl"
Column | Type | Collation | Nullable | Default
--------+---------+-----------+----------+---------------------------------
id | bigint | | not null | nextval('tbl_id_seq'::regclass)
gid | integer | | |
prodid | integer | | |
c1 | integer | | |
c2 | integer | | |
postgres=# alter sequence tbl_id_seq cache 10000;
ALTER SEQUENCE
vi test.sql \set gid random_gaussian(1,1000,2.5) \set prodid random_gaussian(1,1000000,2.5) \set c1 random(1,3000) \set c2 random(1,100000000) insert into tbl(gid,prodid,c1,c2) values (:gid,:prodid,:c1,:c2); pgbench -M prepared -n -r -P 1 -f ./test.sql -c 64 -j 64 -T 1200
postgres=# select count(*) from tbl; count ----------- 216524755 (1 row) Time: 421.860 ms
postgres=# select gid,count(*) from tbl group by gid order by count(*) desc limit 10; gid | count -----+-------- 494 | 438102 499 | 438017 514 | 437929 506 | 437852 511 | 437546 509 | 437469 495 | 437458 490 | 437320 496 | 437257 500 | 437239 (10 rows) postgres=# select c1,count(*) from tbl group by c1 order by count(*) desc limit 10; c1 | count ------+------- 1370 | 73175 168 | 73121 1016 | 73114 1816 | 73045 1463 | 73020 585 | 72986 1529 | 72974 1857 | 72944 2580 | 72930 298 | 72917 (10 rows) postgres=# select prodid,count(*) from tbl group by prodid order by count(*) desc limit 10; prodid | count --------+------- 516916 | 534 481914 | 534 520680 | 527 530544 | 526 449685 | 523 493560 | 523 520464 | 523 502098 | 522 495170 | 522 501695 | 522 (10 rows)
CREATE TABLE reviews_by_gid
(
agg jsonb
);
SELECT create_reference_table('reviews_by_gid');
INSERT INTO reviews_by_gid
SELECT topn_add_agg(gid::text)
FROM tbl;
postgres=# select (topn(agg,5)).* from reviews_by_gid;
item | frequency
------+-----------
494 | 438102
499 | 438017
514 | 437929
506 | 437852
511 | 437546
(5 rows)
CREATE TABLE reviews_by_prodid
(
agg_prodid jsonb
);
SELECT create_reference_table('reviews_by_prodid');
INSERT INTO reviews_by_prodid
SELECT topn_add_agg(prodid::text)
FROM tbl;
postgres=# select (topn(agg_prodid,5)).* from reviews_by_prodid;
item | frequency
--------+-----------
470098 | 36
531880 | 35
451724 | 34
420093 | 34
522676 | 33
(5 rows)
CREATE TABLE reviews_by_c1
(
aggc1 jsonb
);
SELECT create_reference_table('reviews_by_c1');
INSERT INTO reviews_by_c1
SELECT topn_add_agg(c1::text)
FROM tbl;
postgres=# select (topn(aggc1,5)).* from reviews_by_c1;
item | frequency
------+-----------
2580 | 37073
1016 | 36162
1983 | 35311
1752 | 35285
2354 | 34740
(5 rows)
精度、截断
/*
* PruneHashTable removes some items from the HashTable to decrease its size. It finds
* minimum and maximum frequencies first and removes the items which have lower frequency
* than the average of them.
*/
static void
PruneHashTable(HTAB *hashTable, int itemLimit, int numberOfRemainingElements)
{
Size topnArraySize = 0;
int topnIndex = 0;
FrequentTopnItem *sortedTopnArray = NULL;
bool itemAlreadyHashed = false;
HASH_SEQ_STATUS status;
FrequentTopnItem *currentTask = NULL;
FrequentTopnItem *frequentTopnItem = NULL;
int index = 0;
int hashTableSize = hash_get_num_entries(hashTable);
if (hashTableSize <= itemLimit)
{
return;
}
/* create an array to copy top-n items and sort them later */
topnArraySize = sizeof(FrequentTopnItem) * hashTableSize;
sortedTopnArray = (FrequentTopnItem *) palloc0(topnArraySize);
hash_seq_init(&status, hashTable);
while ((currentTask = (FrequentTopnItem *) hash_seq_search(&status)) != NULL)
{
frequentTopnItem = palloc0(sizeof(FrequentTopnItem));
memcpy(frequentTopnItem->key, currentTask->key,
sizeof(frequentTopnItem->key));
frequentTopnItem->frequency = currentTask->frequency;
sortedTopnArray[topnIndex] = *frequentTopnItem;
topnIndex++;
}
qsort(sortedTopnArray, hashTableSize, sizeof(FrequentTopnItem),
compareFrequentTopnItem);
for (index = numberOfRemainingElements; index < hashTableSize; index++)
{
FrequentTopnItem *topnItem = &(sortedTopnArray[index]);
hash_search(hashTable, (void *) topnItem->key, HASH_REMOVE,
&itemAlreadyHashed);
}
}
如何修改hash table size
postgres=# load 'topn'; LOAD postgres=# show topn.number_of_counters ; topn.number_of_counters ------------------------- 1000 (1 row) set topn.number_of_counters =20000;
postgresql.conf shared_preload_libraries='citus,topn,pg_stat_statements' topn.number_of_counters=10000
小结
佳实践
参考
相关文章