redis 是最值得学习的开源项目,

  1. 它广泛应用
  2. 它实现了单机缓存数据库,同时支持网络访问、复制、集群、订阅等高级特性
  3. redis依赖很少,数据结构、日志等基础库也是自行实现,有利于学习
  4. redis代码精悍、质量高,甚至很难找到优化点。

本文阐释redis数据结构,redis 数据结构设计的一大特点是节省内存。原因是redis作为内存数据库需要节省内存使用(连续内存的数组能减少内存碎片)。这对架构设计同样有借鉴意义,通过节省元数据内存使用来把更多元数据/索引放在内存上,可以大大提高处理的吞吐和延迟。

数据结构

在redis的数据结构中

  1. sds 最常用做dict的key
  2. list 作为redis 内部结构使用,例如事件列表,客户端列表
  3. dict 作为数据库使用
  4. redisObject 作为数据库的value使用,object可以划分成string, list, hashtable, set, sorted set 五种基本类型
    1. string 通过sds实现
    2. list 通过quicklist实现;quicklist又是list和ziplist的组合
    3. hashtable,少量数据通过ziplist实现(减小内存碎片),大量数据通过dict实现
    4. set,整数且少量数据通过intset实现,大量数据通过dict实现
    5. sorted set,少量数据通过ziplist实现,大量数据通过dict+skiplist实现(skiplist用来保序)

sds, simple dynamic string

sds 就是动态字符串,在C char* 之上的封装,增加了常用的变量和成员函数。sds 在redis使用广泛,最常用的是作为dict的key。

sds有sdshdr8,sdshdr16,sdshdr32等。后面的8, 16, 32 表示sds的最长长度为2^8, 2^16, 2^32(128byte, 64K, 4M)。一般来说,sdshdr8和sdshdr16就够用了

一般来说

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
/* Note: sdshdr5 is never used, we just access the flags byte directly.
* However is here to document the layout of type 5 SDS strings. */
struct __attribute__ ((__packed__)) sdshdr5 {
unsigned char flags; /* 3 lsb of type, and 5 msb of string length */
char buf[];
};
struct __attribute__ ((__packed__)) sdshdr8 {
uint8_t len; /* used */
uint8_t alloc; /* excluding the header and null terminator */
unsigned char flags; /* 3 lsb of type, 5 unused bits */
char buf[];
};
struct __attribute__ ((__packed__)) sdshdr16 {
uint16_t len; /* used */
uint16_t alloc; /* excluding the header and null terminator */
unsigned char flags; /* 3 lsb of type, 5 unused bits */
char buf[];
};
struct __attribute__ ((__packed__)) sdshdr32 {
uint32_t len; /* used */
uint32_t alloc; /* excluding the header and null terminator */
unsigned char flags; /* 3 lsb of type, 5 unused bits */
char buf[];
};
struct __attribute__ ((__packed__)) sdshdr64 {
uint64_t len; /* used */
uint64_t alloc; /* excluding the header and null terminator */
unsigned char flags; /* 3 lsb of type, 5 unused bits */
char buf[];
};
  1. buf 实际存放数据,等同于C语言的char *
  2. 获得长度不需要遍历buf,只需要将(s)-(sizeof(struct sdshdr##T))强制转型为struct sdshdr##T,然后调用sdshdr##T的方法即可

SDS_HDR(T,s) 表示就s 转型为struct sdshdr##T

1
2
3
4
5
6
7
8
9
10
11
12
typedef char *sds;

#define SDS_HDR(T,s) ((struct sdshdr##T *)((s)-(sizeof(struct sdshdr##T))))

static inline size_t sdslen(const sds s) {
unsigned char flags = s[-1];
switch(flags&SDS_TYPE_MASK) {
case SDS_TYPE_5:
return SDS_TYPE_5_LEN(flags);
case SDS_TYPE_8:
return SDS_HDR(8,s)->len;
...

创建sds,传入C风格的char *,

  1. 通过strlen获得C风格字符串长度
  2. 根据长度确定type
  3. 构造sdsHeader
  4. memcpy(s, init, initlen); 将C风格字符串 copy到sds 的buf成员
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    /* Create a new sds string starting from a null terminated C string. */
    sds sdsnew(const char *init) {
    size_t initlen = (init == NULL) ? 0 : strlen(init);
    return sdsnewlen(init, initlen);
    }

    sds sdsnewlen(const void *init, size_t initlen) {
    void *sh;
    sds s;
    char type = sdsReqType(initlen);
    /* Empty strings are usually created in order to append. Use type 8
    * since type 5 is not good at this. */
    if (type == SDS_TYPE_5 && initlen == 0) type = SDS_TYPE_8;
    int hdrlen = sdsHdrSize(type);
    unsigned char *fp; /* flags pointer. */

    sh = s_malloc(hdrlen+initlen+1);
    if (!init)
    memset(sh, 0, hdrlen+initlen+1);
    if (sh == NULL) return NULL;
    s = (char*)sh+hdrlen;
    fp = ((unsigned char*)s)-1;
    switch(type) {
    case SDS_TYPE_5: {
    *fp = type | (initlen << SDS_TYPE_BITS);
    break;
    }
    case SDS_TYPE_8: {
    SDS_HDR_VAR(8,s);
    sh->len = initlen;
    sh->alloc = initlen;
    *fp = type;
    break;
    }
    ...
    }
    if (initlen && init)
    memcpy(s, init, initlen);
    s[initlen] = '\0';
    return s;
    }

    #define SDS_HDR_VAR(T,s) struct sdshdr##T *sh = (void*)((s)-(sizeof(struct sdshdr##T)));

sds还实现了trim, cmp等常用函数, 是s字符串设计的良好参考

list

双向链表, 增加了len成员变量记录链表的长度

redis 双向链表主要应用在系统结构,例如事件列表fileEvents,定时器事件列表,client列表,slave列表等

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
typedef struct listNode {
struct listNode *prev;
struct listNode *next;
void *value;
} listNode;

typedef struct listIter {
listNode *next;
int direction;
} listIter;

typedef struct list {
listNode *head;
listNode *tail;
void *(*dup)(void *ptr);
void (*free)(void *ptr);
int (*match)(void *ptr, void *key);
unsigned long len;
} list;

dict

dict 是kv哈希表,是redis 的核心存储单元。redis 接收到的kv 就是存放在dict结构里。

每个dict对象有两个dictht,每个dictht是一个hash表。使用两个hash表是后续用来rehash。rehash是哈希表扩容,哈希表扩容期间,由于长度变大,会导致key重新排布。rehash的期望是在key重新排布期间,不影响前台IO对哈希表的读写操作。

  1. redis rehash线程和前台操作使用相同线程,可以避免对dict加锁
  2. 使用两张表,可以渐进式rehash,每次迁移一个桶。rehash期间,表1的长度是旧长度,表2是新长度。读操作需要先读2,后读1;写操作只需要写表2,删除更新操作也需要同时处理两个表。
  3. rehash时,只需要复制key和value的指针,不需要拷贝数据,因此对内存的消耗有限
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    typedef struct dictEntry {
    void *key;
    union {
    void *val;
    uint64_t u64;
    int64_t s64;
    double d;
    } v;
    struct dictEntry *next;
    } dictEntry;

    typedef struct dictType {
    uint64_t (*hashFunction)(const void *key);
    void *(*keyDup)(void *privdata, const void *key);
    void *(*valDup)(void *privdata, const void *obj);
    int (*keyCompare)(void *privdata, const void *key1, const void *key2);
    void (*keyDestructor)(void *privdata, void *key);
    void (*valDestructor)(void *privdata, void *obj);
    } dictType;

    /* This is our hash table structure. Every dictionary has two of this as we
    * implement incremental rehashing, for the old to the new table. */
    typedef struct dictht {
    dictEntry **table;
    unsigned long size;
    unsigned long sizemask;
    unsigned long used;
    } dictht;

    typedef struct dict {
    dictType *type;
    void *privdata;
    dictht ht[2];
    long rehashidx; /* rehashing not in progress if rehashidx == -1 */
    unsigned long iterators; /* number of iterators currently running */
    } dict;

    /* If safe is set to 1 this is a safe iterator, that means, you can call
    * dictAdd, dictFind, and other functions against the dictionary even while
    * iterating. Otherwise it is a non safe iterator, and only dictNext()
    * should be called while iterating. */
    typedef struct dictIterator {
    dict *d;
    long index;
    int table, safe;
    dictEntry *entry, *nextEntry;
    /* unsafe iterator fingerprint for misuse detection. */
    long long fingerprint;
    } dictIterator;

object

object最常见的使用是作为dict的value存在,

redis key 只有sds一种类型,value有五种类型。

  1. String字符串,支持SET, GET命令,底层使用sds实现
  2. List 有序可重复的列表,支持LPUSH, RPOP, LRANGE命令,底层使用quicklist实现
  3. Hash 键值对集合,支持HSET, HGET, HGETALL,底层使用ziplist, hashtable实现
  4. Set 无序唯一的集合,支持SADD, SMEMBER,底层使用intset, hashtable实现
  5. Sorted Set有序唯一的集合(带权重),支持ZADD, ZRANGE, ZRANK命令,底层使用ziplist, skiplist+hashtable组合结构实现

intset 只用来实现set,不用来实现sorted set,原因是sorted set的entry需要有参数作为排序的权重, intset不能支持。

其中redis的hashtable就是dict

1
2
3
4
5
6
7
typedef struct dict {
dictType *type;
void *privdata;
dictht ht[2];
long rehashidx; /* rehashing not in progress if rehashidx == -1 */
unsigned long iterators; /* number of iterators currently running */
} dict;

redis 的常用操作命令

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# 设置key value, value 
SET key value

# value 是列表
LPUSH key value1 value2 # 左侧插入
RPUSH key value1 value2 # 右侧插入
LPOP key # 左侧弹出元素
RPOP key # 右侧弹出元素

# value是哈希表
HSET key field1 val1 field2 val2 # 批量设置<field val>
HMGET key field1 field2 # 批量获取

# value是集合
SADD key member1 member2 # 添加成员
SMEMBERS key # 获取所有成员

# value是有序集合
ZADD key score1 member1 score2 member2 # 添加带分值的成员
ZRANGE key start stop [WITHSCORES] # 按分值升序获取成员

object核心的成员,type,encoding,ptr

1
2
3
4
5
6
7
8
9
typedef struct redisObject {
unsigned type:4;
unsigned encoding:4;
unsigned lru:LRU_BITS; /* LRU time (relative to global lru_clock) or
* LFU data (least significant 8 bits frequency
* and most significant 16 bits access time). */
int refcount;
void *ptr;
} robj;

encoding有10种, robj可以看成不同实现数据结构的统一接口

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
/* Objects encoding. Some kind of objects like Strings and Hashes can be
* internally represented in multiple ways. The 'encoding' field of the object
* is set to one of this fields for this object. */
#define OBJ_ENCODING_RAW 0 /* Raw representation */
#define OBJ_ENCODING_INT 1 /* Encoded as integer */
#define OBJ_ENCODING_HT 2 /* Encoded as hash table */
#define OBJ_ENCODING_ZIPMAP 3 /* Encoded as zipmap */
#define OBJ_ENCODING_LINKEDLIST 4 /* No longer used: old list encoding. */
#define OBJ_ENCODING_ZIPLIST 5 /* Encoded as ziplist */
#define OBJ_ENCODING_INTSET 6 /* Encoded as intset */
#define OBJ_ENCODING_SKIPLIST 7 /* Encoded as skiplist */
#define OBJ_ENCODING_EMBSTR 8 /* Embedded sds string encoding */
#define OBJ_ENCODING_QUICKLIST 9 /* Encoded as linked list of ziplists */

robj *createQuicklistObject(void) {
quicklist *l = quicklistCreate();
robj *o = createObject(OBJ_LIST,l);
o->encoding = OBJ_ENCODING_QUICKLIST;
return o;
}

robj *createZiplistObject(void) {
unsigned char *zl = ziplistNew();
robj *o = createObject(OBJ_LIST,zl);
o->encoding = OBJ_ENCODING_ZIPLIST;
return o;
}

robj *createSetObject(void) {
dict *d = dictCreate(&setDictType,NULL);
robj *o = createObject(OBJ_SET,d);
o->encoding = OBJ_ENCODING_HT;
return o;
}

robj *createIntsetObject(void) {
intset *is = intsetNew();
robj *o = createObject(OBJ_SET,is);
o->encoding = OBJ_ENCODING_INTSET;
return o;
}

object 可以序列化到rdb,也可以被load

1
2
3
4
5
6
7
8
9
10
11
12
/* Load a Redis object of the specified type from the specified file.
* On success a newly allocated object is returned, otherwise NULL. */
robj *rdbLoadObject(int rdbtype, rio *rdb) {
robj *o = NULL, *ele, *dec;
uint64_t len;
unsigned int i;

if (rdbtype == RDB_TYPE_STRING) {
/* Read string value */
if ((o = rdbLoadEncodedStringObject(rdb)) == NULL) return NULL;
o = tryObjectEncoding(o);
...

quicklist

quicklist 用来实现列表类型value。

quicklist 是一个由多个 ziplist(压缩列表)节点组成的双向链表。quicklist的节点就是 ziplist。

quicklist支持对 ziplist 节点进行 LZF 压缩, 从而节省内存。
(redis是内存数据库,节省内存的使用是它必须考虑的设计)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
/* quicklistNode is a 32 byte struct describing a ziplist for a quicklist.
* We use bit fields keep the quicklistNode at 32 bytes.
* count: 16 bits, max 65536 (max zl bytes is 65k, so max count actually < 32k).
* encoding: 2 bits, RAW=1, LZF=2.
* container: 2 bits, NONE=1, ZIPLIST=2.
* recompress: 1 bit, bool, true if node is temporarry decompressed for usage.
* attempted_compress: 1 bit, boolean, used for verifying during testing.
* extra: 12 bits, free for future use; pads out the remainder of 32 bits */
typedef struct quicklistNode {
struct quicklistNode *prev;
struct quicklistNode *next;
// ziplist
unsigned char *zl;
unsigned int sz; /* ziplist size in bytes */
unsigned int count : 16; /* count of items in ziplist */
unsigned int encoding : 2; /* RAW==1 or LZF==2 */
unsigned int container : 2; /* NONE==1 or ZIPLIST==2 */
unsigned int recompress : 1; /* was this node previous compressed? */
unsigned int attempted_compress : 1; /* node can't compress; too small */
unsigned int extra : 10; /* more bits to steal for future usage */
} quicklistNode;

typedef struct quicklist {
quicklistNode *head;
quicklistNode *tail;
unsigned long count; /* total count of all entries in all ziplists */
unsigned long len; /* number of quicklistNodes */
int fill : 16; /* fill factor for individual nodes */
unsigned int compress : 16; /* depth of end nodes not to compress;0=off */
} quicklist;

typedef struct quicklistIter {
const quicklist *quicklist;
quicklistNode *current;
unsigned char *zi;
long offset; /* offset in current ziplist */
int direction;
} quicklistIter;

typedef struct quicklistEntry {
const quicklist *quicklist;
quicklistNode *node;
unsigned char *zi;
unsigned char *value;
long long longval;
unsigned int sz;
int offset;
} quicklistEntry;

ziplist

quicklist的元素是ziplist, 除了quicklist, ziplist还作为元素数量少的 hashtable和sorted set的存储结构。

ziplist是一种紧凑的、连续内存存储的线性数据结构,

1
2
3
4
5
6
7
8
9
10
11
12
+---------+--------+--------+--------+--------+---------+
| zlbytes | zltail | zllen | entry1 | entry2 | ... | zlend |
+---------+--------+--------+--------+--------+---------+

zltail(4字节):最后一个节点的偏移量(用于快速定位尾部)。

+------------------+----------------+----------------+
| prev_entry_len | encoding | content |
+------------------+----------------+----------------+

prev_entry_len 前驱节点长度,用于反向遍历
encoding 编码类型

每个entry主要由三部分组成, prev_entry_len,encoding,content。content就是一段序列,char *p。

当ziplist存放hashtable时,content是dictEntry。当ziplist存放sorted set是,content就是sds。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21


typedef struct zlentry {
unsigned int prevrawlensize; /* Bytes used to encode the previos entry len*/
unsigned int prevrawlen; /* Previous entry len. */
unsigned int lensize; /* Bytes used to encode this entry type/len.
For example strings have a 1, 2 or 5 bytes
header. Integers always use a single byte.*/
unsigned int len; /* Bytes used to represent the actual entry.
For strings this is just the string length
while for integers it is 1, 2, 3, 4, 8 or
0 (for 4 bit immediate) depending on the
number range. */
unsigned int headersize; /* prevrawlensize + lensize. */
unsigned char encoding; /* Set to ZIP_STR_* or ZIP_INT_* depending on
the entry encoding. However for 4 bits
immediate integers this can assume a range
of values and must be range-checked. */
unsigned char *p; /* Pointer to the very start of the entry, that
is, this points to prev-entry-len field. */
} zlentry;

intset

intset 整数集合用来实现元素数量较小的set,需要满足

  1. 元素均为整数
  2. 元素个数小于阈值set-max-intset-entries 512

intset 采用连续内存存储,内部元素有序排布,支持二分查找

intset 支持三种整数类型,按需动态升级:

  1. INTSET_ENC_INT16:每个元素占 2 字节(范围 -32,768 ~ 32,767)。
  2. INTSET_ENC_INT32:每个元素占 4 字节(范围 -2^31 ~ 2^31-1)。
  3. INTSET_ENC_INT64:每个元素占 8 字节(范围 -2^63 ~ 2^63-1)。
1
2
3
4
5
6
// intset.h
typedef struct intset {
uint32_t encoding; // 编码方式(INTSET_ENC_INT16/32/64)
uint32_t length; // 元素数量
int8_t contents[]; // 动态数组,按 encoding 存储整数
} intset;

intset add的都是整数

1
2
3
4
/* Insert an integer in the intset */
intset *intsetAdd(intset *is, int64_t value, uint8_t *success) {
uint8_t valenc = _intsetValueEncoding(value);
uint32_t pos;

skiplist和dict结合

跳跃表,和dict 合作实现有序集合

redis 通过哈希表,实现有序集合O(1)的点查询;通过跳跃表,实现集合的有序和范围查询。

哈希表存放zskiplistNode的地址即可,关键成员是sds ele和double score

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
/* ZSETs use a specialized version of Skiplists */
typedef struct zskiplistNode {
sds ele;
double score;
struct zskiplistNode *backward;
struct zskiplistLevel {
struct zskiplistNode *forward;
unsigned int span;
} level[];
} zskiplistNode;

typedef struct zskiplist {
struct zskiplistNode *header, *tail;
unsigned long length;
int level;
} zskiplist;

typedef struct zset {
dict *dict;
zskiplist *zsl;
} zset;