数据结构之实现动态散列表

学习极客时间《数据结构与算法之美》散列表的应用，使用 golang 实现了一个使用链表法的动态扩容散列表。

散列表就是利用数组支持随机访问的特性，将本需要遍历才能寻找到数据的方式通过一个哈希函数映射到数组下标，这样就可以快速找到数据位置，具体过程如下图：

对于一个健壮的散列表算法来说，其中至关重要的几个关键点：

散列函数的选择
- 对于静态数据来说，由于事先知道数据内容，可以通过数据内容进行制定
- 对于动态数据来说，需要设计好散列函数、装载因子、扩容方式等，以降低散列冲突发生的概率，避免查询效率的下降
装载因子：通常而言装载因子设计为0.75，这属于一个经验值，超过装载因子后底层数组容量加倍扩容
使用链表法还是开放寻址法
- 链表法：数据发生散列冲突后，将元素追加到链表尾部，适合存储大数据量的散列表
- 开放寻址法：适合数据量较小，装载因子较小的情况
如何处理扩容
- 扩容面临的问题：当需要扩容时，所有数据都需要重新计算散列值，如果在插入某个数据之后超过了装载因子，此时全部进行重新散列，势必要消耗大量的时间，导致性能急剧下降
- 解决：可以将重新散列操作均摊至每一次插入操作中，将时间复杂度从极端的O(n)降低为O(1)，在需要扩容时并不集中进行散列操作，而是申请一个新的数组，后续的每次插入操作都将原数组中的一个元素重新散列到新数组当中，实现均摊复杂度

散列表经常和链表同时使用，因为链表的特点是无法实现随机访问，寻找一个元素需要遍历整个链表，而散列表很好的解决了这个问题，所以二者经常配合实现一些经典的算法，例如LRU（最近最少使用）缓存淘汰算法。

下面是使用Go语言实现的链表法动态散列表，仅供参考：

package main

import (
	"fmt"
	"math"
	"math/rand"
	"time"
)

func main() {
	hm := hashMap{
		capacity:   10,  // 初始容量
		loadFactor: 0.8, // 装载因子
	}
	hm.new()

	// 生成测试数据
	testKeyCount := 20
	keys := []string{}
	for i := 0; i < testKeyCount; i++ {
		k := GenRandStr(4)
		keys = append(keys, k)
		hm.setHashMap(k, point{k})
	}

	// --------- 打印容器内数据 ---------
	fmt.Println("====旧容器====")
	hm.printContainer(hm.container)
	fmt.Println("====新容器====")
	hm.printContainer(hm.containerExpansion)

	fmt.Println("--------测试获取元素--------")
	for i := 0; i < testKeyCount; i++ {
		fmt.Println(keys[i], hm.getHashMap(keys[i]))
	}

	fmt.Println("--------测试删除元素--------")
	for i := 0; i < testKeyCount; i++ {
		fmt.Println(keys[i], hm.delHashMap(keys[i]))
	}

	fmt.Println("--------测试删除不存在的元素--------")
	fmt.Println(hm.delHashMap("test"))
}

func GenRandStr(l int) string {
	str := "abcdefghijklmnopqrstuvwxyz"
	bytes := []byte(str)
	result := []byte{}
	r := rand.New(rand.NewSource(time.Now().UnixNano()))
	for i := 0; i < l; i++ {
		result = append(result, bytes[r.Intn(len(bytes))])
	}
	return string(result)
}

// point 散列容器表value
type point struct {
	s string
}

// hashNode 散列表容器结构中真实存储的数据，包含 kv 值
type hashNode struct {
	p    interface{}
	key  string
	next *hashNode
}

// hashMap 结构体，用于代表一个散列表
type hashMap struct {
	loadFactor              float32     // 装填因子
	capacity                int         // 容器容量（当需要转移数据时称为旧容器）
	container               []*hashNode // 旧容器
	containerCount          int         // 旧容器已经填充的数据量
	capacityExpansion       int         // 容器容量（当需要转移数据时称为新容器）
	containerExpansion      []*hashNode // 新容器
	containerCountExpansion int         // 新容器已经填充的数据量
}

// printContainer 打印容器内容
func (hm *hashMap) printContainer(container []*hashNode) {
	var tempNode *hashNode
	for _, v := range container {
		if v != nil {
			tempNode = v
			for {
				fmt.Printf("%s", tempNode.key)
				if tempNode.next == nil {
					break
				}
				fmt.Printf(" -> ")
				tempNode = tempNode.next
			}
			fmt.Println()
		} else {
			fmt.Println(v)
		}
	}
}

// hm 散列函数
func (hm *hashMap) hm(s string, useCapacity bool) int {
	var sLen float64 = float64(len(s) - 1)
	asciiSum := 0
	// 散列规则：将字符串中每个字母的 ASCII 码值“进位”相加，然后跟容器大小取模
	for _, asciiCode := range s {
		asciiSum += int(asciiCode-'a') * int(math.Pow(26, sLen))
		sLen--
	}
	// 使用旧容器
	if useCapacity {
		return asciiSum % hm.capacity
	}
	// 存在新容器，就使用新容器
	if hm.capacityExpansion != 0 {
		return asciiSum % hm.capacityExpansion
	}
	return asciiSum % hm.capacity
}

// new 初始化容器
func (hm *hashMap) new() {
	for i := 0; i < hm.capacity; i++ {
		hm.container = append(hm.container, nil)
	}
}

func (hm *hashMap) insertLikedList(p *hashNode, key string, value interface{}) {
	for {
		// key 存在，直接替换
		if p.key == key {
			p.p = value
			break
		}
		if p.next != nil {
			p = p.next
		} else {
			p.next = &hashNode{value, key, nil}
			break
		}
	}
}

// insertHashMap 向容器中插入数据
func (hm *hashMap) insertContainer(key string, value interface{}) {
	index := hm.hm(key, false)
	if p := hm.container[index]; p != nil {
		hm.insertLikedList(p, key, value)
	} else {
		// 容器空位，直接放入，并且增加容器占用量
		hm.container[index] = &hashNode{value, key, nil}
		hm.containerCount++
	}
}

// insertHashMapExpansion 向新容器中插入数据
func (hm *hashMap) insertContainerExpansion(key string, value interface{}) {
	index := hm.hm(key, false)
	if p := hm.containerExpansion[index]; p != nil {
		hm.insertLikedList(p, key, value)
	} else {
		// 容器空位，直接放入，并且增加容器占用量
		hm.containerExpansion[index] = &hashNode{value, key, nil}
		hm.containerCountExpansion++
	}
}

// newExpansion 初始化新容器
func (hm *hashMap) newExpansion() {
	hm.containerCountExpansion = 0
	hm.capacityExpansion = hm.capacity * 2
	hm.containerExpansion = []*hashNode{}
	for i := 0; i < hm.capacityExpansion; i++ {
		hm.containerExpansion = append(hm.containerExpansion, nil)
	}
}

// moveElement 从旧容器向新容器移动一个元素
func (hm *hashMap) moveElement() {
	// 从旧容器中获取一个元素，插入到新容器中
	var tempNode *hashNode
	for i := 0; i < hm.capacity; i++ {
		if temp := hm.container[i]; temp != nil {
			tempNode = temp
			if temp.next == nil {
				hm.containerCount--
			}
			hm.container[i] = temp.next
			break
		}
	}
	hm.insertContainerExpansion(tempNode.key, tempNode.p)
	// 如果旧容器中元素被移动完成，则将新容器覆盖旧容器，完成一次完整扩容操作
	if hm.containerCount == 0 {
		hm.capacity = hm.capacityExpansion
		hm.container = hm.containerExpansion
		hm.containerCount = hm.containerCountExpansion
		hm.capacityExpansion = 0
		hm.containerCountExpansion = 0
		hm.containerExpansion = nil
	}
}

// setHashMap 设置散列表元素
func (hm *hashMap) setHashMap(key string, value interface{}) {
	// 元素填充量超过了装填因子，此时需要创建一个新的容器进行扩容
	if hm.capacityExpansion == 0 && hm.containerCount*100/hm.capacity >= int(hm.loadFactor*100) {
		hm.newExpansion()
	}

	// 有新容器，说明处于扩容阶段，向新容器中插入数据并同时移动旧容器的数据到新容器中
	if hm.capacityExpansion != 0 {
		hm.insertContainerExpansion(key, value)
		hm.moveElement()
	} else {
		// 不存在新容器则将元素插入旧容器
		hm.insertContainer(key, value)
	}
}

// findLinkedList 在链表中寻找元素
func (hm *hashMap) findLinkedList(key string, p *hashNode) interface{} {
	var ret interface{}
	for {
		if p.key == key {
			ret = p.p
		}
		if p.next != nil {
			p = p.next
		} else {
			break
		}
	}
	return ret
}

// getHashMap 获取散列表元素
func (hm *hashMap) getHashMap(key string) interface{} {
	// 查找新容器中元素是否存在
	var ret interface{}
	if hm.capacityExpansion != 0 {
		index := hm.hm(key, false)
		if p := hm.containerExpansion[index]; p != nil {
			ret = hm.findLinkedList(key, p)
		}
	}
	// 查找旧容器中元素是否存在
	index := hm.hm(key, true)
	if p := hm.container[index]; ret == nil && p != nil {
		ret = hm.findLinkedList(key, p)
	}
	return ret
}

// 删除散列表元素，删除成功返回 true，失败返回 false
func (hm *hashMap) delHashMap(key string) bool {
	// 查找新容器
	if hm.capacityExpansion != 0 {
		index := hm.hm(key, false)
		if p := hm.containerExpansion[index]; p != nil {
			var prevNode *hashNode
			for {
				// 找到元素，删除链表节点
				if p.key == key {
					if prevNode != nil {
						// 链表中还有元素
						prevNode.next = p.next
					} else {
						if p.next != nil {
							hm.containerExpansion[index] = p.next
						} else {
							// 链表已空，链表填充的数据量记录减少
							hm.containerExpansion[index] = nil
							hm.containerCountExpansion--
						}
					}
					return true
				}
				if p.next != nil {
					prevNode = p
					p = p.next
				} else {
					break
				}
			}
		}
	}
	// 查找旧容器
	index := hm.hm(key, true)
	if p := hm.container[index]; hm.container[index] != nil {
		var prevNode *hashNode
		for {
			// 找到元素，删除链表节点
			if p.key == key {
				if prevNode != nil {
					// 链表中还有元素
					prevNode.next = p.next
				} else {
					if p.next != nil {
						hm.container[index] = p.next
					} else {
						// 链表已空，链表填充的数据量记录减少
						hm.container[index] = nil
						hm.containerCount--
					}
				}
				return true
			}
			if p.next != nil {
				prevNode = p
				p = p.next
			} else {
				return false
			}
		}
	}
	return false
}

Lipsum

数据结构之实现动态散列表

均摊扩容演示：