redis(1)——数据结构

redis 是最值得学习的开源项目，

它广泛应用
它实现了单机缓存数据库，同时支持网络访问、复制、集群、订阅等高级特性
redis依赖很少，数据结构、日志等基础库也是自行实现，有利于学习
redis代码精悍、质量高，甚至很难找到优化点。

本文阐释redis数据结构，redis 数据结构设计的一大特点是节省内存。原因是redis作为内存数据库需要节省内存使用(连续内存的数组能减少内存碎片)。这对架构设计同样有借鉴意义，通过节省元数据内存使用来把更多元数据/索引放在内存上，可以大大提高处理的吞吐和延迟。

数据结构

在redis的数据结构中

sds 最常用做dict的key
list 作为redis 内部结构使用，例如事件列表，客户端列表
dict 作为数据库使用
redisObject 作为数据库的value使用，object可以划分成string, list, hashtable, set, sorted set 五种基本类型
1. string 通过sds实现
2. list 通过quicklist实现；quicklist又是list和ziplist的组合
3. hashtable，少量数据通过ziplist实现（减小内存碎片），大量数据通过dict实现
4. set，整数且少量数据通过intset实现，大量数据通过dict实现
5. sorted set，少量数据通过ziplist实现，大量数据通过dict+skiplist实现（skiplist用来保序）

sds, simple dynamic string

sds 就是动态字符串，在C char* 之上的封装，增加了常用的变量和成员函数。sds 在redis使用广泛，最常用的是作为dict的key。

sds有sdshdr8，sdshdr16，sdshdr32等。后面的8, 16, 32 表示sds的最长长度为2^8, 2^16, 2^32(128byte, 64K, 4M)。一般来说，sdshdr8和sdshdr16就够用了

一般来说

/* Note: sdshdr5 is never used, we just access the flags byte directly.
 * However is here to document the layout of type 5 SDS strings. */
struct __attribute__ ((__packed__)) sdshdr5 {
    unsigned char flags; /* 3 lsb of type, and 5 msb of string length */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr8 {
    uint8_t len; /* used */
    uint8_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr16 {
    uint16_t len; /* used */
    uint16_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr32 {
    uint32_t len; /* used */
    uint32_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};
struct __attribute__ ((__packed__)) sdshdr64 {
    uint64_t len; /* used */
    uint64_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};

buf 实际存放数据，等同于C语言的char *
获得长度不需要遍历buf，只需要将(s)-(sizeof(struct sdshdr##T))强制转型为struct sdshdr##T，然后调用sdshdr##T的方法即可

SDS_HDR(T,s) 表示就s 转型为struct sdshdr##T

typedef char *sds;

#define SDS_HDR(T,s) ((struct sdshdr##T *)((s)-(sizeof(struct sdshdr##T))))

static inline size_t sdslen(const sds s) {
    unsigned char flags = s[-1];
    switch(flags&SDS_TYPE_MASK) {
        case SDS_TYPE_5:
            return SDS_TYPE_5_LEN(flags);
        case SDS_TYPE_8:
            return SDS_HDR(8,s)->len;
        ...

创建sds，传入C风格的char *，

通过strlen获得C风格字符串长度
根据长度确定type
构造sdsHeader

memcpy(s, init, initlen); 将C风格字符串 copy到sds 的buf成员

/* Create a new sds string starting from a null terminated C string. */
sds sdsnew(const char *init) {
    size_t initlen = (init == NULL) ? 0 : strlen(init);
    return sdsnewlen(init, initlen);
}

sds sdsnewlen(const void *init, size_t initlen) {
    void *sh;
    sds s;
    char type = sdsReqType(initlen);
    /* Empty strings are usually created in order to append. Use type 8
     * since type 5 is not good at this. */
    if (type == SDS_TYPE_5 && initlen == 0) type = SDS_TYPE_8;
    int hdrlen = sdsHdrSize(type);
    unsigned char *fp; /* flags pointer. */

    sh = s_malloc(hdrlen+initlen+1);
    if (!init)
        memset(sh, 0, hdrlen+initlen+1);
    if (sh == NULL) return NULL;
    s = (char*)sh+hdrlen;
    fp = ((unsigned char*)s)-1;
    switch(type) {
        case SDS_TYPE_5: {
            *fp = type | (initlen << SDS_TYPE_BITS);
            break;
        }
        case SDS_TYPE_8: {
            SDS_HDR_VAR(8,s);
            sh->len = initlen;
            sh->alloc = initlen;
            *fp = type;
            break;
        }
        ...
    }
    if (initlen && init)
        memcpy(s, init, initlen);
    s[initlen] = '\0';
    return s;
}

#define SDS_HDR_VAR(T,s) struct sdshdr##T *sh = (void*)((s)-(sizeof(struct sdshdr##T)));

sds还实现了trim, cmp等常用函数, 是s字符串设计的良好参考

list

双向链表, 增加了len成员变量记录链表的长度

redis 双向链表主要应用在系统结构，例如事件列表fileEvents，定时器事件列表，client列表，slave列表等

typedef struct listNode {
    struct listNode *prev;
    struct listNode *next;
    void *value;
} listNode;

typedef struct listIter {
    listNode *next;
    int direction;
} listIter;

typedef struct list {
    listNode *head;
    listNode *tail;
    void *(*dup)(void *ptr);
    void (*free)(void *ptr);
    int (*match)(void *ptr, void *key);
    unsigned long len;
} list;

dict

dict 是kv哈希表，是redis 的核心存储单元。redis 接收到的kv 就是存放在dict结构里。

每个dict对象有两个dictht，每个dictht是一个hash表。使用两个hash表是后续用来rehash。rehash是哈希表扩容，哈希表扩容期间，由于长度变大，会导致key重新排布。rehash的期望是在key重新排布期间，不影响前台IO对哈希表的读写操作。

redis rehash线程和前台操作使用相同线程，可以避免对dict加锁
使用两张表，可以渐进式rehash，每次迁移一个桶。rehash期间，表1的长度是旧长度，表2是新长度。读操作需要先读2，后读1；写操作只需要写表2，删除更新操作也需要同时处理两个表。

rehash时，只需要复制key和value的指针，不需要拷贝数据，因此对内存的消耗有限

typedef struct dictEntry {
    void *key;
    union {
        void *val;
        uint64_t u64;
        int64_t s64;
        double d;
    } v;
    struct dictEntry *next;
} dictEntry;

typedef struct dictType {
    uint64_t (*hashFunction)(const void *key);
    void *(*keyDup)(void *privdata, const void *key);
    void *(*valDup)(void *privdata, const void *obj);
    int (*keyCompare)(void *privdata, const void *key1, const void *key2);
    void (*keyDestructor)(void *privdata, void *key);
    void (*valDestructor)(void *privdata, void *obj);
} dictType;

/* This is our hash table structure. Every dictionary has two of this as we
 * implement incremental rehashing, for the old to the new table. */
typedef struct dictht {
    dictEntry **table;
    unsigned long size;
    unsigned long sizemask;
    unsigned long used;
} dictht;

typedef struct dict {
    dictType *type;
    void *privdata;
    dictht ht[2];
    long rehashidx; /* rehashing not in progress if rehashidx == -1 */
    unsigned long iterators; /* number of iterators currently running */
} dict;

/* If safe is set to 1 this is a safe iterator, that means, you can call
 * dictAdd, dictFind, and other functions against the dictionary even while
 * iterating. Otherwise it is a non safe iterator, and only dictNext()
 * should be called while iterating. */
typedef struct dictIterator {
    dict *d;
    long index;
    int table, safe;
    dictEntry *entry, *nextEntry;
    /* unsafe iterator fingerprint for misuse detection. */
    long long fingerprint;
} dictIterator;

object

object最常见的使用是作为dict的value存在,

redis key 只有sds一种类型，value有五种类型。

String字符串，支持SET, GET命令，底层使用sds实现
List 有序可重复的列表，支持LPUSH, RPOP, LRANGE命令，底层使用quicklist实现
Hash 键值对集合，支持HSET, HGET, HGETALL，底层使用ziplist, hashtable实现
Set 无序唯一的集合，支持SADD, SMEMBER，底层使用intset, hashtable实现
Sorted Set有序唯一的集合（带权重），支持ZADD, ZRANGE, ZRANK命令，底层使用ziplist, skiplist+hashtable组合结构实现

intset 只用来实现set，不用来实现sorted set，原因是sorted set的entry需要有参数作为排序的权重, intset不能支持。

其中redis的hashtable就是dict

typedef struct dict {
    dictType *type;
    void *privdata;
    dictht ht[2];
    long rehashidx; /* rehashing not in progress if rehashidx == -1 */
    unsigned long iterators; /* number of iterators currently running */
} dict;

redis 的常用操作命令

# 设置key value, value 
SET key value

# value 是列表
LPUSH key value1 value2  # 左侧插入
RPUSH key value1 value2  # 右侧插入
LPOP key                 # 左侧弹出元素
RPOP key                 # 右侧弹出元素

# value是哈希表
HSET key field1 val1 field2 val2   # 批量设置<field val>
HMGET key field1 field2  # 批量获取

# value是集合
SADD key member1 member2  # 添加成员
SMEMBERS key              # 获取所有成员

# value是有序集合
ZADD key score1 member1 score2 member2  # 添加带分值的成员
ZRANGE key start stop [WITHSCORES]      # 按分值升序获取成员

object核心的成员，type，encoding，ptr

typedef struct redisObject {
    unsigned type:4;
    unsigned encoding:4;
    unsigned lru:LRU_BITS; /* LRU time (relative to global lru_clock) or
                            * LFU data (least significant 8 bits frequency
                            * and most significant 16 bits access time). */
    int refcount;
    void *ptr;
} robj;

encoding有10种, robj可以看成不同实现数据结构的统一接口

/* Objects encoding. Some kind of objects like Strings and Hashes can be
 * internally represented in multiple ways. The 'encoding' field of the object
 * is set to one of this fields for this object. */
#define OBJ_ENCODING_RAW 0     /* Raw representation */
#define OBJ_ENCODING_INT 1     /* Encoded as integer */
#define OBJ_ENCODING_HT 2      /* Encoded as hash table */
#define OBJ_ENCODING_ZIPMAP 3  /* Encoded as zipmap */
#define OBJ_ENCODING_LINKEDLIST 4 /* No longer used: old list encoding. */
#define OBJ_ENCODING_ZIPLIST 5 /* Encoded as ziplist */
#define OBJ_ENCODING_INTSET 6  /* Encoded as intset */
#define OBJ_ENCODING_SKIPLIST 7  /* Encoded as skiplist */
#define OBJ_ENCODING_EMBSTR 8  /* Embedded sds string encoding */
#define OBJ_ENCODING_QUICKLIST 9 /* Encoded as linked list of ziplists */

robj *createQuicklistObject(void) {
    quicklist *l = quicklistCreate();
    robj *o = createObject(OBJ_LIST,l);
    o->encoding = OBJ_ENCODING_QUICKLIST;
    return o;
}

robj *createZiplistObject(void) {
    unsigned char *zl = ziplistNew();
    robj *o = createObject(OBJ_LIST,zl);
    o->encoding = OBJ_ENCODING_ZIPLIST;
    return o;
}

robj *createSetObject(void) {
    dict *d = dictCreate(&setDictType,NULL);
    robj *o = createObject(OBJ_SET,d);
    o->encoding = OBJ_ENCODING_HT;
    return o;
}

robj *createIntsetObject(void) {
    intset *is = intsetNew();
    robj *o = createObject(OBJ_SET,is);
    o->encoding = OBJ_ENCODING_INTSET;
    return o;
}

object 可以序列化到rdb，也可以被load

/* Load a Redis object of the specified type from the specified file.
 * On success a newly allocated object is returned, otherwise NULL. */
robj *rdbLoadObject(int rdbtype, rio *rdb) {
    robj *o = NULL, *ele, *dec;
    uint64_t len;
    unsigned int i;

    if (rdbtype == RDB_TYPE_STRING) {
        /* Read string value */
        if ((o = rdbLoadEncodedStringObject(rdb)) == NULL) return NULL;
        o = tryObjectEncoding(o);
...

quicklist

quicklist 用来实现列表类型value。

quicklist 是一个由多个 ziplist（压缩列表）节点组成的双向链表。quicklist的节点就是 ziplist。

quicklist支持对 ziplist 节点进行 LZF 压缩, 从而节省内存。
（redis是内存数据库，节省内存的使用是它必须考虑的设计）

/* quicklistNode is a 32 byte struct describing a ziplist for a quicklist.
 * We use bit fields keep the quicklistNode at 32 bytes.
 * count: 16 bits, max 65536 (max zl bytes is 65k, so max count actually < 32k).
 * encoding: 2 bits, RAW=1, LZF=2.
 * container: 2 bits, NONE=1, ZIPLIST=2.
 * recompress: 1 bit, bool, true if node is temporarry decompressed for usage.
 * attempted_compress: 1 bit, boolean, used for verifying during testing.
 * extra: 12 bits, free for future use; pads out the remainder of 32 bits */
typedef struct quicklistNode {
    struct quicklistNode *prev;
    struct quicklistNode *next;
    // ziplist
    unsigned char *zl;
    unsigned int sz;             /* ziplist size in bytes */
    unsigned int count : 16;     /* count of items in ziplist */
    unsigned int encoding : 2;   /* RAW==1 or LZF==2 */
    unsigned int container : 2;  /* NONE==1 or ZIPLIST==2 */
    unsigned int recompress : 1; /* was this node previous compressed? */
    unsigned int attempted_compress : 1; /* node can't compress; too small */
    unsigned int extra : 10; /* more bits to steal for future usage */
} quicklistNode;

typedef struct quicklist {
    quicklistNode *head;
    quicklistNode *tail;
    unsigned long count;        /* total count of all entries in all ziplists */
    unsigned long len;          /* number of quicklistNodes */
    int fill : 16;              /* fill factor for individual nodes */
    unsigned int compress : 16; /* depth of end nodes not to compress;0=off */
} quicklist;

typedef struct quicklistIter {
    const quicklist *quicklist;
    quicklistNode *current;
    unsigned char *zi;
    long offset; /* offset in current ziplist */
    int direction;
} quicklistIter;

typedef struct quicklistEntry {
    const quicklist *quicklist;
    quicklistNode *node;
    unsigned char *zi;
    unsigned char *value;
    long long longval;
    unsigned int sz;
    int offset;
} quicklistEntry;

ziplist

quicklist的元素是ziplist, 除了quicklist, ziplist还作为元素数量少的 hashtable和sorted set的存储结构。

ziplist是一种紧凑的、连续内存存储的线性数据结构，

+---------+--------+--------+--------+--------+---------+
| zlbytes | zltail | zllen  | entry1 | entry2 | ... | zlend |
+---------+--------+--------+--------+--------+---------+

zltail（4字节）：最后一个节点的偏移量（用于快速定位尾部）。

+------------------+----------------+----------------+
| prev_entry_len   | encoding       | content        |
+------------------+----------------+----------------+

prev_entry_len 前驱节点长度，用于反向遍历
encoding 编码类型

每个entry主要由三部分组成， prev_entry_len，encoding，content。content就是一段序列，char *p。

当ziplist存放hashtable时，content是dictEntry。当ziplist存放sorted set是，content就是sds。



typedef struct zlentry {
    unsigned int prevrawlensize; /* Bytes used to encode the previos entry len*/
    unsigned int prevrawlen;     /* Previous entry len. */
    unsigned int lensize;        /* Bytes used to encode this entry type/len.
                                    For example strings have a 1, 2 or 5 bytes
                                    header. Integers always use a single byte.*/
    unsigned int len;            /* Bytes used to represent the actual entry.
                                    For strings this is just the string length
                                    while for integers it is 1, 2, 3, 4, 8 or
                                    0 (for 4 bit immediate) depending on the
                                    number range. */
    unsigned int headersize;     /* prevrawlensize + lensize. */
    unsigned char encoding;      /* Set to ZIP_STR_* or ZIP_INT_* depending on
                                    the entry encoding. However for 4 bits
                                    immediate integers this can assume a range
                                    of values and must be range-checked. */
    unsigned char *p;            /* Pointer to the very start of the entry, that
                                    is, this points to prev-entry-len field. */
} zlentry;

intset

intset 整数集合用来实现元素数量较小的set，需要满足

元素均为整数
元素个数小于阈值set-max-intset-entries 512

intset 采用连续内存存储，内部元素有序排布，支持二分查找

intset 支持三种整数类型，按需动态升级：

INTSET_ENC_INT16：每个元素占 2 字节（范围 -32,768 ~ 32,767）。
INTSET_ENC_INT32：每个元素占 4 字节（范围 -2^31 ~ 2^31-1）。
INTSET_ENC_INT64：每个元素占 8 字节（范围 -2^63 ~ 2^63-1）。

// intset.h
typedef struct intset {
    uint32_t encoding;  // 编码方式（INTSET_ENC_INT16/32/64）
    uint32_t length;    // 元素数量
    int8_t contents[];  // 动态数组，按 encoding 存储整数
} intset;

intset add的都是整数

/* Insert an integer in the intset */
intset *intsetAdd(intset *is, int64_t value, uint8_t *success) {
    uint8_t valenc = _intsetValueEncoding(value);
    uint32_t pos;

skiplist和dict结合

跳跃表，和dict 合作实现有序集合

redis 通过哈希表，实现有序集合O(1)的点查询；通过跳跃表，实现集合的有序和范围查询。

哈希表存放zskiplistNode的地址即可，关键成员是sds ele和double score

/* ZSETs use a specialized version of Skiplists */
typedef struct zskiplistNode {
    sds ele;
    double score;
    struct zskiplistNode *backward;
    struct zskiplistLevel {
        struct zskiplistNode *forward;
        unsigned int span;
    } level[];
} zskiplistNode;

typedef struct zskiplist {
    struct zskiplistNode *header, *tail;
    unsigned long length;
    int level;
} zskiplist;

typedef struct zset {
    dict *dict;
    zskiplist *zsl;
} zset;