redis7.x源码分析:(7) 过期键处理

设置了过期时间的键，除了保存在数据库的 dict 字典中，同时还会把它的过期时间保存在 expires 字典中。

/* Per-database state (excerpt: the remaining fields are elided below). */
typedef struct redisDb {
    // Dictionary that stores the data of this database
    dict *dict;                 /* The keyspace for this DB */
    // Dictionary of the keys that have a timeout set
    dict *expires;              /* Timeout of keys with a timeout set */
    
    ......
} redisDb;

expires 存储的过期时间是一个以毫秒为单位的 unix 时间戳，类型是有符号 64 位整数（int64_t），存放在 dictEntry 的联合体 v 的 s64 成员中。代码中会使用宏 dictGetSignedIntegerVal 来读取过期时间。

/* One entry of a hash table bucket: a key, a value stored in a union, and a
 * link to the next entry of the same bucket. */
typedef struct dictEntry {
    // Key of any type
    void *key;
    // Stored value; which union member is meaningful depends on the user
    union {
        void *val;
        uint64_t u64;
        int64_t s64;
        double d;
    } v;
    // Next element of the linked list in the same bucket
    struct dictEntry *next;     /* Next entry in the same hash bucket. */
    void *metadata[];           /* An arbitrary number of bytes (starting at a
                                 * pointer-aligned address) of size as returned
                                 * by dictType's dictEntryMetadataBytes(). */
} dictEntry;


/* Read the value of dict entry 'he' through the signed 64-bit member (s64)
 * of its value union. */
#define dictGetSignedIntegerVal(he) ((he)->v.s64)

目前过期时间可以通过 EXPIRE、PEXPIRE、EXPIREAT、PEXPIREAT 以及 SET + EX 等命令设置；查看剩余生存时间可以使用 TTL、PTTL；移除过期时间可以使用 PERSIST 命令。具体用法可以自行查阅相关资料。

对于过期键的判断其实也非常简单，从expires取到过期时间when后，直接与当前unix时间戳now进行对比，如果now > when，则判定为过期。

Redis的过期删除策略采用了惰性删除和定期删除两种方式：

1.惰性删除

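Redis 在每次读写 key 时会调用 expireIfNeeded 判断 key 是否过期。如果已过期，主节点会把 key 从 dict 和 expires 中删除，并把删除操作扩散出去；从节点通常只向调用者返回"已过期"，并不真正删除，而是等待主节点下发删除命令（带有 EXPIRE_FORCE_DELETE_EXPIRED 标志的操作除外）。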

/* Lazy expiration check for 'key' in 'db'.
 *
 * Returns 0 when the key is not considered expired (this includes the case
 * of a replica processing commands from its master), and 1 when the key is
 * considered expired. The key is actually deleted only when none of the
 * early-return conditions below applies.
 *
 * 'flags' is tested against EXPIRE_FORCE_DELETE_EXPIRED and
 * EXPIRE_AVOID_DELETE_EXPIRED. */
int expireIfNeeded(redisDb *db, robj *key, int flags) {
    // Nothing to do if the key is not expired
    if (!keyIsExpired(db,key)) return 0;

    /* If we are running in the context of a replica, instead of
     * evicting the expired key from the database, we return ASAP:
     * the replica key expiration is controlled by the master that will
     * send us synthesized DEL operations for expired keys. The
     * exception is when write operations are performed on writable
     * replicas.
     *
     * Still we try to return the right information to the caller,
     * that is, 0 if we think the key should be still valid, 1 if
     * we think the key is expired at this time.
     *
     * When replicating commands from the master, keys are never considered
     * expired. */
    // This instance is a replica (a master host is configured)
    if (server.masterhost != NULL) {
        // The current client is the master link: report "not expired"
        if (server.current_client == server.master) return 0;
        // Deletion is not forced by the caller: report "expired", do not delete
        if (!(flags & EXPIRE_FORCE_DELETE_EXPIRED)) return 1;
    }

    /* In some cases we're explicitly instructed to return an indication of a
     * missing key without actually deleting it, even on masters. */
    // The caller asked to avoid the deletion: report "expired", do not delete
    if (flags & EXPIRE_AVOID_DELETE_EXPIRED)
        return 1;

    /* If clients are paused, we keep the current dataset constant,
     * but return to the client what we believe is the right state. Typically,
     * at the end of the pause we will properly expire the key OR we will
     * have failed over and the new primary will send us the expire. */
    // Clients are paused: keep the dataset unchanged, report "expired" only
    if (checkClientPauseTimeoutAndReturnIfPaused()) return 1;

    /* Delete the key */
    // Delete the key and propagate the deletion (AOF and replicas)
    deleteExpiredKeyAndPropagate(db,key);
    return 1;
}

2.定期删除

Redis的定期删除是通过服务器周期定时函数serverCron（后续介绍）调用数据库周期函数databasesCron实现的。

/* Periodic database maintenance (excerpt: the rest of the function is elided
 * below). When active expiration is enabled, a master runs a slow active
 * expire cycle, while any other instance runs expireSlaveKeys(). */
void databasesCron(void) {
    /* Expire keys by random sampling. Not required for slaves
     * as master will synthesize DELs for us. */
    if (server.active_expire_enabled) {
        if (iAmMaster()) {
            // Master: sample and remove expired keys (slow cycle)
            activeExpireCycle(ACTIVE_EXPIRE_CYCLE_SLOW);
        } else {
            // Not a master: check the keys tracked in slaveKeysWithExpire,
            // i.e. keys whose expire was set directly on a writable replica
            expireSlaveKeys();
        }
    }
    ......
}

其中最关键的是真正执行删除动作的 activeExpireCycle 函数。需要注意的是，该函数只在主节点上执行，从节点上执行的是 expireSlaveKeys。activeExpireCycle 每次被调用时，会从上次停下的数据库开始，依次处理若干个数据库（默认 CRON_DBS_PER_CALL 个；如果上次因超时退出，则处理全部数据库）的 expires 字典，从中采样一部分 key 并删除其中已经过期的 key。

/* Active (periodic) expiration: sample keys from the 'expires' dictionary of
 * the databases and delete the ones that are already expired.
 *
 * 'type' is compared against ACTIVE_EXPIRE_CYCLE_FAST: a fast cycle may
 * return immediately (see the checks below) and uses
 * config_cycle_fast_duration as its time limit; any other value uses the
 * time limit derived from config_cycle_slow_time_perc and server.hz.
 *
 * The function keeps static state (current_db, timelimit_exit,
 * last_fast_cycle) so that the work continues incrementally across calls.
 * Before returning normally it propagates the pending commands and updates
 * server.stat_expire_cycle_time_used and server.stat_expired_stale_perc. */
void activeExpireCycle(int type) {
    /* Adjust the running parameters according to the configured expire
     * effort. The default effort is 1, and the maximum configurable effort
     * is 10. */
    unsigned long
    effort = server.active_expire_effort-1, /* Rescale from 0 to 9. */
    config_keys_per_loop = ACTIVE_EXPIRE_CYCLE_KEYS_PER_LOOP +
                           ACTIVE_EXPIRE_CYCLE_KEYS_PER_LOOP/4*effort, /* max keys sampled per loop */
    config_cycle_fast_duration = ACTIVE_EXPIRE_CYCLE_FAST_DURATION +
                                 ACTIVE_EXPIRE_CYCLE_FAST_DURATION/4*effort, /* duration of a fast cycle */
    config_cycle_slow_time_perc = ACTIVE_EXPIRE_CYCLE_SLOW_TIME_PERC +  /* CPU percentage of a slow cycle (converted below into a microsecond budget) */
                                  2*effort,
    config_cycle_acceptable_stale = ACTIVE_EXPIRE_CYCLE_ACCEPTABLE_STALE- /* acceptable percentage of possibly stale keys */
                                    effort;

    /* This function has some global state in order to continue the work
     * incrementally across calls. */
    // Index of the database to check next
    static unsigned int current_db = 0; /* Next DB to test. */
    // Whether the previous call stopped because of the time limit
    static int timelimit_exit = 0;      /* Time limit hit in previous call? */
    // Start time of the last fast cycle
    static long long last_fast_cycle = 0; /* When last fast cycle ran. */

    int j, iteration = 0;
    int dbs_per_call = CRON_DBS_PER_CALL;
    // Start time of this call
    long long start = ustime(), timelimit, elapsed;

    /* When clients are paused the dataset should be static not just from the
     * POV of clients not being able to write, but also from the POV of
     * expires and evictions of keys not being performed. */
    if (checkClientPauseTimeoutAndReturnIfPaused()) return;

    if (type == ACTIVE_EXPIRE_CYCLE_FAST) {
        /* Don't start a fast cycle if the previous cycle did not exit
         * for time limit, unless the percentage of estimated stale keys is
         * too high. Also never repeat a fast cycle for the same period
         * as the fast cycle total duration itself. */
        // Return if the previous call did not hit the time limit and the
        // estimated stale value is below the acceptable threshold.
        // NOTE(review): stat_expired_stale_perc is updated at the end of this
        // function from total_expired/total_sampled (a 0..1 ratio), while
        // config_cycle_acceptable_stale is used as a percentage in the
        // do/while condition below -- confirm the units of this comparison
        // against upstream.
        if (!timelimit_exit &&
            server.stat_expired_stale_perc < config_cycle_acceptable_stale)
            return;

        /* Fast cycles are not run back to back: return when the current
         * start time is earlier than the start of the last fast cycle plus
         * twice the fast cycle duration.
             |  config_cycle_fast_duration   |  config_cycle_fast_duration   |
             +-------------------------------+-------------------------------+
             |                                                |
         last_fast_cycle                                    start
        */
        if (start < last_fast_cycle + (long long)config_cycle_fast_duration*2)
            return;

        // Remember when this fast cycle started
        last_fast_cycle = start;
    }

    /* We usually should test CRON_DBS_PER_CALL per iteration, with
     * two exceptions:
     *
     * 1) Don't test more DBs than we have.
     * 2) If last time we hit the time limit, we want to scan all DBs
     * in this iteration, as there is work to do in some DB and we don't want
     * expired keys to use memory for too much time. */
    if (dbs_per_call > server.dbnum || timelimit_exit)
        dbs_per_call = server.dbnum;

    /* We can use at max 'config_cycle_slow_time_perc' percentage of CPU
     * time per iteration. Since this function gets called with a frequency of
     * server.hz times per second, the following is the max amount of
     * microseconds we can spend in this function. */
    // Convert the CPU percentage into a budget in microseconds
    timelimit = config_cycle_slow_time_perc*1000000/server.hz/100;
    timelimit_exit = 0;
    if (timelimit <= 0) timelimit = 1;

    // A fast cycle uses config_cycle_fast_duration as its time limit instead
    if (type == ACTIVE_EXPIRE_CYCLE_FAST)
        timelimit = config_cycle_fast_duration; /* in microseconds. */

    /* Accumulate some global stats as we expire keys, to have some idea
     * about the number of keys that are already logically expired, but still
     * existing inside the database. */
    long total_sampled = 0;
    long total_expired = 0;

    /* Sanity: There can't be any pending commands to propagate when
     * we're in cron */
    serverAssert(server.also_propagate.numops == 0);
    server.core_propagates = 1;
    // Propagate the commands without wrapping them in a transaction
    server.propagate_no_multi = 1;

    for (j = 0; j < dbs_per_call && timelimit_exit == 0; j++) {
        /* Expired and checked in a single loop. */
        unsigned long expired, sampled;
        // Pick the next database, wrapping around server.dbnum
        redisDb *db = server.db+(current_db % server.dbnum);

        /* Increment the DB now so we are sure if we run out of time
         * in the current DB we'll restart from the next. This allows to
         * distribute the time evenly across DBs. */
        current_db++;

        /* Continue to expire if at the end of the cycle there are still
         * a big percentage of keys to expire, compared to the number of keys
         * we scanned. The percentage, stored in config_cycle_acceptable_stale
         * is not fixed, but depends on the Redis configured "expire effort". */
        do {
            unsigned long num, slots;
            long long now, ttl_sum;
            int ttl_samples;
            iteration++;

            /* If there is nothing to expire try next DB ASAP. */
            // No key with a timeout in this DB: move on to the next one
            if ((num = dictSize(db->expires)) == 0) {
                db->avg_ttl = 0;
                break;
            }
            // Total number of slots of the expires dictionary
            slots = dictSlots(db->expires);
            now = mstime();

            /* When there are less than 1% filled slots, sampling the key
             * space is expensive, so stop here waiting for better times...
             * The dictionary will be resized asap. */
            // Fill ratio below 1% (and table larger than the initial size):
            // leave this DB for now
            if (slots > DICT_HT_INITIAL_SIZE &&
                (num*100/slots < 1)) break;

            /* The main collection cycle. Sample random keys among keys
             * with an expire set, checking for expired ones. */
            expired = 0;
            sampled = 0;
            ttl_sum = 0;
            ttl_samples = 0;

            // Cap the number of keys to sample in this loop
            if (num > config_keys_per_loop)
                num = config_keys_per_loop;

            /* Here we access the low level representation of the hash table
             * for speed concerns: this makes this code coupled with dict.c,
             * but it hardly changed in ten years.
             *
             * Note that certain places of the hash table may be empty,
             * so we want also a stop condition about the number of
             * buckets that we scanned. However scanning for free buckets
             * is very fast: we are in the cache line scanning a sequential
             * array of NULL pointers, so we can scan a lot more buckets
             * than keys in the same time. */
            long max_buckets = num*20;
            long checked_buckets = 0;

            while (sampled < num && checked_buckets < max_buckets) {
                for (int table = 0; table < 2; table++) {
                    // The second hash table is only visited while the
                    // dictionary is rehashing
                    if (table == 1 && !dictIsRehashing(db->expires)) break;

                    // Resume from the bucket index saved in the DB cursor,
                    // masked to the size of the current table
                    unsigned long idx = db->expires_cursor;
                    idx &= DICTHT_SIZE_MASK(db->expires->ht_size_exp[table]);
                    dictEntry *de = db->expires->ht_table[table][idx];
                    long long ttl;

                    /* Scan the current bucket of the current table. */
                    checked_buckets++;
                    while(de) {
                        // Walk the linked list of entries of this bucket
                        /* Get the next entry now since this entry may get
                         * deleted. */
                        dictEntry *e = de;
                        de = de->next;

                        // ttl is computed before the entry may be deleted
                        ttl = dictGetSignedIntegerVal(e)-now;
                        // Try to expire the key; count it when it was expired
                        if (activeExpireCycleTryExpire(db,e,now)) expired++;
                        if (ttl > 0) {
                            /* We want the average TTL of keys yet
                             * not expired. */
                            ttl_sum += ttl;
                            ttl_samples++;
                        }
                        sampled++;
                    }
                }
                db->expires_cursor++;
            }
            // Accumulate the sampled and expired counters of this call
            total_expired += expired;
            total_sampled += sampled;

            /* Update the average TTL stats for this database. */
            if (ttl_samples) {
                long long avg_ttl = ttl_sum/ttl_samples;

                /* Do a simple running average with a few samples.
                 * We just use the current estimate with a weight of 2%
                 * and the previous estimate with a weight of 98%. */
                // Smoothed TTL = old_ttl * 98% + new_ttl * 2% (49/50 and 1/50)
                if (db->avg_ttl == 0) db->avg_ttl = avg_ttl;
                db->avg_ttl = (db->avg_ttl/50)*49 + (avg_ttl/50);
            }

            /* We can't block forever here even if there are many keys to
             * expire. So after a given amount of milliseconds return to the
             * caller waiting for the other active expire cycle. */
            if ((iteration & 0xf) == 0) { /* check once every 16 iterations. */
                // Every 16 iterations compare the elapsed time (taken with
                // ustime()) against the time limit
                elapsed = ustime()-start;
                if (elapsed > timelimit) {
                    timelimit_exit = 1;
                    server.stat_expired_time_cap_reached_count++;
                    break;
                }
            }
            /* We don't repeat the cycle for the current database if there are
             * an acceptable amount of stale keys (logically expired but yet
             * not reclaimed). */
            // Repeat for this DB while nothing was sampled, or while the
            // percentage of expired keys among the sampled ones exceeds
            // config_cycle_acceptable_stale
        } while (sampled == 0 ||
                 (expired*100/sampled) > config_cycle_acceptable_stale);
    }

    serverAssert(server.core_propagates); /* This function should not be re-entrant */

    /* Propagate all DELs */
    // Propagate the accumulated deletions (AOF and replicas)
    propagatePendingCommands();

    server.core_propagates = 0;
    server.propagate_no_multi = 0;

    // Account for the time spent in this call (measured with ustime())
    elapsed = ustime()-start;
    server.stat_expire_cycle_time_used += elapsed;
    latencyAddSampleIfNeeded("expire-cycle",elapsed/1000);

    /* Update our estimate of keys existing but yet to be expired.
     * Running average with this sample accounting for 5%. */
    double current_perc;
    if (total_sampled) {
        current_perc = (double)total_expired/total_sampled;
    } else
        current_perc = 0;
    // New estimate = old estimate * 95% + current sample * 5%
    server.stat_expired_stale_perc = (current_perc*0.05)+
                                     (server.stat_expired_stale_perc*0.95);
}

expireSlaveKeys 比较特殊，它是从 slaveKeysWithExpire 表中取出可能过期的键进行过期判断的。

/* Check the keys tracked in slaveKeysWithExpire and try to expire them.
 *
 * Each entry maps a key name to a bitmap of database ids. For every bit set,
 * the key is looked up in the 'expires' dictionary of that database and
 * activeExpireCycleTryExpire() is called on it. Entries whose bitmap becomes
 * zero are removed from the tracking dictionary. The loop stops on one of the
 * conditions documented at its end. */
void expireSlaveKeys(void) {
    // Nothing is tracked: return immediately
    if (slaveKeysWithExpire == NULL ||
        dictSize(slaveKeysWithExpire) == 0) return;

    int cycles = 0, noexpire = 0;
    mstime_t start = mstime();
    while(1) {
        // Pick a random entry of the tracking dictionary
        dictEntry *de = dictGetRandomKey(slaveKeysWithExpire);
        sds keyname = dictGetKey(de);
        // dbids is a bitmap indexed by database id: a set bit marks a
        // database in which this key has to be checked
        uint64_t dbids = dictGetUnsignedIntegerVal(de);
        uint64_t new_dbids = 0;

        /* Check the key against every database corresponding to the
         * bits set in the value bitmap. */
        int dbid = 0;
        while(dbids && dbid < server.dbnum) {
            // The lowest bit is set: database 'dbid' may hold this key
            if ((dbids & 1) != 0) {
                redisDb *db = server.db+dbid;
                dictEntry *expire = dictFind(db->expires,keyname);
                int expired = 0;

                // Try to expire the key, using the loop start time as "now"
                if (expire &&
                    activeExpireCycleTryExpire(server.db+dbid,expire,start))
                {
                    expired = 1;
                }

                /* If the key was not expired in this DB, we need to set the
                 * corresponding bit in the new bitmap we set as value.
                 * At the end of the loop if the bitmap is zero, it means we
                 * no longer need to keep track of this key. */
                // The key still has an expire in this DB and was not expired:
                // keep its bit in new_dbids
                if (expire && !expired) {
                    noexpire++;
                    new_dbids |= (uint64_t)1 << dbid;
                }
            }
            dbid++;
            // Shift so that the lowest bit corresponds to the next dbid
            dbids >>= 1;
        }

        /* Set the new bitmap as value of the key, in the dictionary
         * of keys with an expire set directly in the writable slave. Otherwise
         * if the bitmap is zero, we no longer need to keep track of it. */
        // Non-zero bitmap: store it back in the entry. Zero: nothing is left
        // to track, so remove the key from slaveKeysWithExpire
        if (new_dbids)
            dictSetUnsignedIntegerVal(de,new_dbids);
        else
            dictDelete(slaveKeysWithExpire,keyname);

        /* Stop conditions: found 3 keys we can't expire in a row or
         * time limit was reached. */
        cycles++;
        // noexpire is never reset inside the loop, so this stops once more
        // than 3 not-yet-expired (key, db) pairs were seen in total
        if (noexpire > 3) break;
        // Every 64 cycles, stop if more than 1 ms has elapsed
        if ((cycles % 64) == 0 && mstime()-start > 1) break;
        // Stop when the tracking dictionary is empty
        if (dictSize(slaveKeysWithExpire) == 0) break;
    }
}

需要注意的是，activeExpireCycle 在 beforeSleep 函数中也会执行一次，此时使用的是快速模式（ACTIVE_EXPIRE_CYCLE_FAST）；前提是开启了 active_expire_enabled，并且当前节点不是从节点（即 server.masterhost == NULL）。

Redis源码解析文章被收录于专栏

基于redis7.x版本的源码分析