当前位置:网站首页>postgresql之integerset
postgresql之integerset
2022-07-06 18:39:00 【happytree001】
一、简介
1.1 integerset是什么
正如其名,integer set, 整数集合。这里是指存储整数的数据结构。对于数学中集合的定义,集合中的元素是没有重复的。
integerset特点:
能存储64bit整数
内存数据结构,内存操作速度快
使用B-tree结构组织数据,插入、搜索 复杂度O(log2)
使用Simple-8b算法压缩数据,使存储空间更小,还能使用SIMD等指令来加速
1.2 能用来干什么
- 能存储最大64bit的任意正整数
- 快速检索某个整数是否在集合中
1.3 有什么限制
- 插入的值必须是有序的
- 在遍历集合的时候不能插入值
- 不支持删除值
- 不支持负数
1.4 对比redis的intset
| 数据结构 | 可存储最大值 | 组织形式 | 搜索方式 | 存储方式 | 支持负数 | 插入顺序 |
|---|---|---|---|---|---|---|
| integerset | unsigned 64bit | B-tree | 遍历树+折半搜索 | simple-8b编码 | No | 必须有序 |
| intset | signed 64bit | 数组 | 折半搜索 | 直接存值 | Yes | 任意,自动排序 |
1.5 对比redis的listpack变长编码
| 数据结构 | 数值编码方式 | 编码形式 | 编码长度影响因素 |
|---|---|---|---|
| integerset | simple-8b编码 | 批量编码 | 多个连续数值之间的差值大小 |
| listpack | 变长字节编码 | 单数值编码 | 单个数值本身大小 |
二、 数据结构
2.1 数据结构定义
typedef struct intset_node intset_node;
typedef struct intset_leaf_node intset_leaf_node;
typedef struct intset_internal_node intset_internal_node;
/* Common structure of both leaf and internal nodes. */
struct intset_node
{
uint16 level; /* tree level of this node */
uint16 num_items; /* number of items in this node */
};
/* Internal node */
struct intset_internal_node
{
/* common header, must match intset_node */
uint16 level; /* >= 1 on internal nodes */
uint16 num_items;
/* * 'values' is an array of key values, and 'downlinks' are pointers to * lower-level nodes, corresponding to the key values. */
uint64 values[MAX_INTERNAL_ITEMS];
intset_node *downlinks[MAX_INTERNAL_ITEMS];
};
/* Leaf node */
typedef struct
{
uint64 first; /* first integer in this item */
uint64 codeword; /* simple8b encoded differences from 'first' */
} leaf_item;
#define MAX_VALUES_PER_LEAF_ITEM (1 + SIMPLE8B_MAX_VALUES_PER_CODEWORD)
struct intset_leaf_node
{
/* common header, must match intset_node */
uint16 level; /* 0 on leafs */
uint16 num_items;
intset_leaf_node *next; /* right sibling, if any */
leaf_item items[MAX_LEAF_ITEMS];
};
/* * We buffer insertions in a simple array, before packing and inserting them * into the B-tree. MAX_BUFFERED_VALUES sets the size of the buffer. The * encoder assumes that it is large enough that we can always fill a leaf * item with buffered new items. In other words, MAX_BUFFERED_VALUES must be * larger than MAX_VALUES_PER_LEAF_ITEM. For efficiency, make it much larger. */
#define MAX_BUFFERED_VALUES (MAX_VALUES_PER_LEAF_ITEM * 2)
/* * IntegerSet is the top-level object representing the set. * * The integers are stored in an in-memory B-tree structure, plus an array * for newly-added integers. IntegerSet also tracks information about memory * usage, as well as the current position when iterating the set with * intset_begin_iterate / intset_iterate_next. */
struct IntegerSet
{
/* * 'context' is the memory context holding this integer set and all its * tree nodes. * * 'mem_used' tracks the amount of memory used. We don't do anything with * it in integerset.c itself, but the callers can ask for it with * intset_memory_usage(). */
MemoryContext context;
uint64 mem_used;
uint64 num_entries; /* total # of values in the set */
uint64 highest_value; /* highest value stored in this set */
/* * B-tree to hold the packed values. * * 'rightmost_nodes' hold pointers to the rightmost node on each level. * rightmost_parent[0] is rightmost leaf, rightmost_parent[1] is its * parent, and so forth, all the way up to the root. These are needed when * adding new values. (Currently, we require that new values are added at * the end.) */
int num_levels; /* height of the tree */
intset_node *root; /* root node */
intset_node *rightmost_nodes[MAX_TREE_LEVELS];
intset_leaf_node *leftmost_leaf; /* leftmost leaf node */
/* * Holding area for new items that haven't been inserted to the tree yet. */
uint64 buffered_values[MAX_BUFFERED_VALUES];
int num_buffered_values;
/* * Iterator support. * * 'iter_values' is an array of integers ready to be returned to the * caller; 'iter_num_values' is the length of that array, and * 'iter_valueno' is the next index. 'iter_node' and 'iter_itemno' point * to the leaf node, and item within the leaf node, to get the next batch * of values from. * * Normally, 'iter_values' points to 'iter_values_buf', which holds items * decoded from a leaf item. But after we have scanned the whole B-tree, * we iterate through all the unbuffered values, too, by pointing * iter_values to 'buffered_values'. */
bool iter_active; /* is iteration in progress? */
const uint64 *iter_values;
int iter_num_values; /* number of elements in 'iter_values' */
int iter_valueno; /* next index into 'iter_values' */
intset_leaf_node *iter_node; /* current leaf node */
int iter_itemno; /* next item in 'iter_node' to decode */
uint64 iter_values_buf[MAX_VALUES_PER_LEAF_ITEM];
};
其中 intset_node可以看作是基类,其他继承它。
class intset_node {
};
class intset_internal_node : intset_node {
};
class intset_leaf_node : intset_node {
};
2.2 结构图

三、 相关API
3.1 创建integerset
/* * Create a new, initially empty, integer set. * * The integer set is created in the current memory context. * We will do all subsequent allocations in the same context, too, regardless * of which memory context is current when new integers are added to the set. */
IntegerSet *
intset_create(void)
{
IntegerSet *intset;
intset = (IntegerSet *) palloc(sizeof(IntegerSet));
intset->context = CurrentMemoryContext;
intset->mem_used = GetMemoryChunkSpace(intset);
intset->num_entries = 0;
intset->highest_value = 0;
intset->num_levels = 0;
intset->root = NULL;
memset(intset->rightmost_nodes, 0, sizeof(intset->rightmost_nodes));
intset->leftmost_leaf = NULL;
intset->num_buffered_values = 0;
intset->iter_active = false;
intset->iter_node = NULL;
intset->iter_itemno = 0;
intset->iter_valueno = 0;
intset->iter_num_values = 0;
intset->iter_values = NULL;
return intset;
}

3.2 插入元素
/* * Add a value to the set. * * Values must be added in order. */
void
intset_add_member(IntegerSet *intset, uint64 x)
{
if (intset->iter_active)
elog(ERROR, "cannot add new values to integer set while iteration is in progress");
if (x <= intset->highest_value && intset->num_entries > 0)
elog(ERROR, "cannot add value to integer set out of order");
if (intset->num_buffered_values >= MAX_BUFFERED_VALUES)
{
/* Time to flush our buffer */
intset_flush_buffered_values(intset);
Assert(intset->num_buffered_values < MAX_BUFFERED_VALUES);
}
/* Add it to the buffer of newly-added values */
intset->buffered_values[intset->num_buffered_values] = x;
intset->num_buffered_values++;
intset->num_entries++;
intset->highest_value = x;
}
- 首先会加入到缓存区中
- 等缓冲区满后,再插入B树中
(1) 为啥先插入缓冲区,而不是直接插入B-tree
simple-8b编码方式决定的
B-tree中叶子节点中存储的是通过simple-8b编码后的值, 而不是原始值。
(2) 什么是simple-8b编码
simple-8b, 其中b是byte, 使用8字节编码,每次编码后的结果都是8字节。
8byte, 64bit, 其中开始4bit用于选择使用编码模式,不同的编码模式,决定了后面的60bit的如何分配来存储数据。
static const struct simple8b_mode
{
uint8 bits_per_int;
uint8 num_ints;
} simple8b_modes[17] =
{
{
0, 240}, /* mode 0: 240 zeroes */
{
0, 120}, /* mode 1: 120 zeroes */
{
1, 60}, /* mode 2: sixty 1-bit integers */
{
2, 30}, /* mode 3: thirty 2-bit integers */
{
3, 20}, /* mode 4: twenty 3-bit integers */
{
4, 15}, /* mode 5: fifteen 4-bit integers */
{
5, 12}, /* mode 6: twelve 5-bit integers */
{
6, 10}, /* mode 7: ten 6-bit integers */
{
7, 8}, /* mode 8: eight 7-bit integers (four bits * are wasted) */
{
8, 7}, /* mode 9: seven 8-bit integers (four bits * are wasted) */
{
10, 6}, /* mode 10: six 10-bit integers */
{
12, 5}, /* mode 11: five 12-bit integers */
{
15, 4}, /* mode 12: four 15-bit integers */
{
20, 3}, /* mode 13: three 20-bit integers */
{
30, 2}, /* mode 14: two 30-bit integers */
{
60, 1}, /* mode 15: one 60-bit integer */
{
0, 0} /* sentinel value */
};
根据编码模式可以看出,数值越小,能编码的个数就越多,越节省空间。
那如何将需要编码的数值变小呢
对相邻两数差值进行编码,因此数据集越集中,差值越小,能编码的个数越多
只存储差值,如何恢复原始值呢?
为了能恢复原始值,因此编码分为两部分
- 基数base
- 后续数据差值编码
typedef struct
{
uint64 first; // base
uint64 codeword; //编码结果
} leaf_item;

比如使用mode10, 则每个数值使用10bit,总共可以编码6个数值
1010 xxxxxxxxxx xxxxxxxxxx xxxxxxxxxx xxxxxxxxxx xxxxxxxxxx xxxxxxxxxx
只有60 bit用来编码数据,那大于60bit的数值如何编码的呢?
对于大于60bit的数值,将数据存入base, 编码8byte就写UINT64CONST(0x0FFFFFFFFFFFFFFF),编码数据长度0,相当于浪费了8byte。
a. simple-8b编码实现
/* * EMPTY_CODEWORD is a special value, used to indicate "no values". * It is used if the next value is too large to be encoded with Simple-8b. * * This value looks like a mode-0 codeword, but we can distinguish it * because a regular mode-0 codeword would have zeroes in the unused bits. */
#define EMPTY_CODEWORD UINT64CONST(0x0FFFFFFFFFFFFFFF)
/* * Encode a number of integers into a Simple-8b codeword. * * (What we actually encode are deltas between successive integers. * "base" is the value before ints[0].) * * The input array must contain at least SIMPLE8B_MAX_VALUES_PER_CODEWORD * elements, ensuring that we can produce a full codeword. * * Returns the encoded codeword, and sets *num_encoded to the number of * input integers that were encoded. That can be zero, if the first delta * is too large to be encoded. */
static uint64
simple8b_encode(const uint64 *ints, int *num_encoded, uint64 base)
{
int selector;
int nints;
int bits;
uint64 diff;
uint64 last_val;
uint64 codeword;
int i;
Assert(ints[0] > base);
/* * Select the "mode" to use for this codeword. * * In each iteration, check if the next value can be represented in the * current mode we're considering. If it's too large, then step up the * mode to a wider one, and repeat. If it fits, move on to the next * integer. Repeat until the codeword is full, given the current mode. * * Note that we don't have any way to represent unused slots in the * codeword, so we require each codeword to be "full". It is always * possible to produce a full codeword unless the very first delta is too * large to be encoded. For example, if the first delta is small but the * second is too large to be encoded, we'll end up using the last "mode", * which has nints == 1. */
selector = 0;
nints = simple8b_modes[0].num_ints;
bits = simple8b_modes[0].bits_per_int;
diff = ints[0] - base - 1;
last_val = ints[0];
i = 0; /* number of deltas we have accepted */
for (;;)
{
if (diff >= (UINT64CONST(1) << bits))
{
/* too large, step up to next mode */
selector++;
nints = simple8b_modes[selector].num_ints;
bits = simple8b_modes[selector].bits_per_int;
/* we might already have accepted enough deltas for this mode */
if (i >= nints)
break;
}
else
{
/* accept this delta; then done if codeword is full */
i++;
if (i >= nints)
break;
/* examine next delta */
Assert(ints[i] > last_val);
diff = ints[i] - last_val - 1;
last_val = ints[i];
}
}
if (nints == 0)
{
/* * The first delta is too large to be encoded with Simple-8b. * * If there is at least one not-too-large integer in the input, we * will encode it using mode 15 (or a more compact mode). Hence, we * can only get here if the *first* delta is >= 2^60. */
Assert(i == 0);
*num_encoded = 0;
return EMPTY_CODEWORD;
}
/* * Encode the integers using the selected mode. Note that we shift them * into the codeword in reverse order, so that they will come out in the * correct order in the decoder. */
codeword = 0;
if (bits > 0)
{
for (i = nints - 1; i > 0; i--)
{
diff = ints[i] - ints[i - 1] - 1;
codeword |= diff;
codeword <<= bits;
}
diff = ints[0] - base - 1;
codeword |= diff;
}
/* add selector to the codeword, and return */
codeword |= (uint64) selector << 60;
*num_encoded = nints;
return codeword;
}
b. simple-8b解码实现
将8byte的编码数据,恢复到一个数组中。
/* * Decode a codeword into an array of integers. * Returns the number of integers decoded. */
static int
simple8b_decode(uint64 codeword, uint64 *decoded, uint64 base)
{
int selector = (codeword >> 60);
int nints = simple8b_modes[selector].num_ints;
int bits = simple8b_modes[selector].bits_per_int;
uint64 mask = (UINT64CONST(1) << bits) - 1;
uint64 curr_value;
if (codeword == EMPTY_CODEWORD)
return 0;
curr_value = base;
for (int i = 0; i < nints; i++)
{
uint64 diff = codeword & mask;
curr_value += 1 + diff;
decoded[i] = curr_value;
codeword >>= bits;
}
return nints;
}
(3) 比如插入1-600
- 首先是插入缓冲区

- 当缓冲区写满后,将进行编码以及插入B-tree

- 剩余数据继续插入缓冲区

每次编码结束后,如果缓冲区中还有剩余的数据,将会拷贝到缓冲区开头
我尝试将这个缓冲区改造成一个循环队列,这样就不用进行数据的拷贝了。但是对于一些使用缓冲区 的地方都要加求余操作,看起来就没有原来的代码更清晰简单,而且随着队列的使用,将会将数组分成两个部分,两个部分依然有序,还是使用折半搜索,一次尝试还是不错的,调试,当能正常工作后,还是挺高兴的
3.2 判断某个值是否在integerset中
- 首先判断是否在缓冲区中,缓冲区是有序递增的,所以使用折半搜索
- 判断值在哪个节点范围内,因为B-tree特性,也使用折半搜索
- 最后在simple-8b编码数据中搜索, simple8b_contains函数和simple8b_decode类似
/* * Does the set contain the given value? */
bool
intset_is_member(IntegerSet *intset, uint64 x)
{
intset_node *node;
intset_leaf_node *leaf;
int level;
int itemno;
leaf_item *item;
/* * The value might be in the buffer of newly-added values. */
if (intset->num_buffered_values > 0 && x >= intset->buffered_values[0])
{
int itemno;
itemno = intset_binsrch_uint64(x,
intset->buffered_values,
intset->num_buffered_values,
false);
if (itemno >= intset->num_buffered_values)
return false;
else
return (intset->buffered_values[itemno] == x);
}
/* * Start from the root, and walk down the B-tree to find the right leaf * node. */
if (!intset->root)
return false;
node = intset->root;
for (level = intset->num_levels - 1; level > 0; level--)
{
intset_internal_node *n = (intset_internal_node *) node;
Assert(node->level == level);
itemno = intset_binsrch_uint64(x, n->values, n->num_items, true);
if (itemno == 0)
return false;
node = n->downlinks[itemno - 1];
}
Assert(node->level == 0);
leaf = (intset_leaf_node *) node;
/* * Binary search to find the right item on the leaf page */
itemno = intset_binsrch_leaf(x, leaf->items, leaf->num_items, true);
if (itemno == 0)
return false;
item = &leaf->items[itemno - 1];
/* Is this a match to the first value on the item? */
if (item->first == x)
return true;
Assert(x > item->first);
/* Is it in the packed codeword? */
if (simple8b_contains(item->codeword, x, item->first))
return true;
return false;
}
3.3 遍历整个integerset
/* * Begin in-order scan through all the values. * * While the iteration is in-progress, you cannot add new values to the set. */
void
intset_begin_iterate(IntegerSet *intset)
{
/* Note that we allow an iteration to be abandoned midway */
intset->iter_active = true;
intset->iter_node = intset->leftmost_leaf;
intset->iter_itemno = 0;
intset->iter_valueno = 0;
intset->iter_num_values = 0;
intset->iter_values = intset->iter_values_buf;
}
/* * Returns the next integer, when iterating. * * intset_begin_iterate() must be called first. intset_iterate_next() returns * the next value in the set. Returns true, if there was another value, and * stores the value in *next. Otherwise, returns false. */
bool
intset_iterate_next(IntegerSet *intset, uint64 *next)
{
Assert(intset->iter_active);
for (;;)
{
/* Return next iter_values[] entry if any */
if (intset->iter_valueno < intset->iter_num_values)
{
*next = intset->iter_values[intset->iter_valueno++];
return true;
}
/* Decode next item in current leaf node, if any */
if (intset->iter_node &&
intset->iter_itemno < intset->iter_node->num_items)
{
leaf_item *item;
int num_decoded;
item = &intset->iter_node->items[intset->iter_itemno++];
intset->iter_values_buf[0] = item->first;
num_decoded = simple8b_decode(item->codeword,
&intset->iter_values_buf[1],
item->first);
intset->iter_num_values = num_decoded + 1;
intset->iter_valueno = 0;
continue;
}
/* No more items on this leaf, step to next node */
if (intset->iter_node)
{
intset->iter_node = intset->iter_node->next;
intset->iter_itemno = 0;
continue;
}
/* * We have reached the end of the B-tree. But we might still have * some integers in the buffer of newly-added values. */
if (intset->iter_values == (const uint64 *) intset->iter_values_buf)
{
intset->iter_values = intset->buffered_values;
intset->iter_num_values = intset->num_buffered_values;
intset->iter_valueno = 0;
continue;
}
break;
}
/* No more results. */
intset->iter_active = false;
*next = 0; /* prevent uninitialized-variable warnings */
return false;
}
从最左边的叶子节点开始遍历,叶子节点通过next指针连接成了一个链表,逐一遍历
每个叶子节点中有多个item, 逐一遍历item
每个item又是经过simple-8b编码的数据,通过解码,将数据写入临时缓冲区中
每次都从临时缓冲区中返回一个数据
当遍历完所有节点后,进行遍历缓冲区
四、总结
integerset是一个正整数集合,最大值为64bit
使用B-tree结构进行管理数据
使用simple-8b编码数据,进行压缩
插入顺序必须有序
不支持删除数据
数据压缩率依赖数据之间的差距
迭代遍历过程中不能插入数据,未实现
搜索过程都使用了折半搜索
批量编码数据
待把B-tree学习清楚后,可以尝试实现非有序方式插入
边栏推荐
- TiFlash 源码阅读(四)TiFlash DDL 模块设计及实现分析
- Several classes and functions that must be clarified when using Ceres to slam
- BigDecimal 的正确使用方式
- 传感器:土壤湿度传感器(XH-M214)介绍及stm32驱动代码
- [paper reading | deep reading] dngr:deep neural networks for learning graph representations
- Tips for web development: skillfully use ThreadLocal to avoid layer by layer value transmission
- [unity] upgraded version · Excel data analysis, automatically create corresponding C classes, automatically create scriptableobject generation classes, and automatically serialize asset files
- What to do when encountering slow SQL? (next)
- Lidar: introduction and usage of ouster OS
- FLIR blackfly s usb3 industrial camera: white balance setting method
猜你喜欢

SchedulX V1.4.0及SaaS版发布,免费体验降本增效高级功能!

Robot team learning method to achieve 8.8 times human return

如何从0到1构建32Core树莓派集群
![[paper reading | deep reading] graphsage:inductive representation learning on large graphs](/img/fe/3d4ccfa79a9624ac4901feeea1e69f.png)
[paper reading | deep reading] graphsage:inductive representation learning on large graphs

Time synchronization of livox lidar hardware -- PPS method

【论文阅读|深读】DNGR:Deep Neural Networks for Learning Graph Representations

TiFlash 源码阅读(四)TiFlash DDL 模块设计及实现分析

Schedulx v1.4.0 and SaaS versions are released, and you can experience the advanced functions of cost reduction and efficiency increase for free!

2022/0524/bookstrap

ROS learning (26) dynamic parameter configuration
随机推荐
Ros Learning (23) Action Communication Mechanism
传感器:DS1302时钟芯片及驱动代码
微服务架构介绍
sql中批量删除数据---实体中的集合
FLIR blackfly s industrial camera: auto exposure configuration and code
MetaForce原力元宇宙开发搭建丨佛萨奇2.0系统开发
Date processing tool class dateutils (tool class 1)
leetcode:736. Lisp 语法解析【花里胡哨 + 栈 + 状态enumaotu + slots】
RC振荡器和晶体振荡器简介
Tiflash source code reading (IV) design and implementation analysis of tiflash DDL module
ROS learning (24) plugin
go swagger使用
How can reinforcement learning be used in medical imaging? A review of Emory University's latest "reinforcement learning medical image analysis", which expounds the latest RL medical image analysis co
ROS learning (XX) robot slam function package -- installation and testing of rgbdslam
Seconds understand the delay and timing function of wechat applet
The use of video in the wiper component causes full screen dislocation
#yyds干货盘点# 解决名企真题:最大差值
企业中台建设新路径——低代码平台
Lumion 11.0 software installation package download and installation tutorial
Recognition of C language array