字符串处理 String Processing
字符串处理 String Processing
字符串处理是算法中的基础问题,涉及字符串匹配、编辑、转换等多个方面。本指南涵盖常见的字符串处理算法和技巧。
字符串匹配算法 String Matching Algorithms
1. KMP算法 Knuth-Morris-Pratt
def buildNext(pattern: str) -> List[int]:
"""构建KMP算法的next数组"""
next = [0] * len(pattern)
j = 0
for i in range(1, len(pattern)):
while j > 0 and pattern[i] != pattern[j]:
j = next[j-1]
if pattern[i] == pattern[j]:
j += 1
next[i] = j
return next
def kmpSearch(text: str, pattern: str) -> List[int]:
"""KMP查找所有匹配位置"""
if not pattern:
return []
next = buildNext(pattern)
matches = []
j = 0
for i in range(len(text)):
while j > 0 and text[i] != pattern[j]:
j = next[j-1]
if text[i] == pattern[j]:
j += 1
if j == len(pattern):
matches.append(i - j + 1)
j = next[j-1]
return matches
2. Rabin-Karp算法
def rabinKarp(text: str, pattern: str) -> List[int]:
"""Rabin-Karp字符串匹配"""
if not pattern:
return []
# 选择一个合适的进制和模数
base = 256
mod = 1000000007
# 计算pattern的哈希值
pattern_hash = 0
for c in pattern:
pattern_hash = (pattern_hash * base + ord(c)) % mod
# 计算文本串中pattern长度的滑动窗口哈希值
n, m = len(text), len(pattern)
text_hash = 0
highest_pow = pow(base, m-1, mod)
matches = []
for i in range(n):
# 添加新字符到哈希值
text_hash = (text_hash * base + ord(text[i])) % mod
# 如果窗口未达到pattern长度,继续
if i < m-1:
continue
# 移除最左边的字符
if i >= m:
text_hash = (text_hash - ord(text[i-m]) * highest_pow) % mod
# 哈希值匹配时,进行字符串比较
if text_hash == pattern_hash and text[i-m+1:i+1] == pattern:
matches.append(i-m+1)
return matches
3. Boyer-Moore算法
def buildBadCharTable(pattern: str) -> Dict[str, int]:
"""构建坏字符规则表"""
table = {}
for i in range(len(pattern)-1):
table[pattern[i]] = len(pattern) - 1 - i
return table
def boyerMoore(text: str, pattern: str) -> List[int]:
"""Boyer-Moore字符串匹配"""
bad_char = buildBadCharTable(pattern)
matches = []
i = len(pattern) - 1
while i < len(text):
skip = 0
for j in range(len(pattern)-1, -1, -1):
if text[i-len(pattern)+j+1] != pattern[j]:
skip = bad_char.get(text[i], len(pattern))
break
if skip == 0:
matches.append(i-len(pattern)+1)
skip = 1
i += skip
return matches
正则表达式 Regular Expression
1. 基本正则匹配
def isMatch(text: str, pattern: str) -> bool:
"""简单正则表达式匹配(支持.和*)"""
if not pattern:
return not text
first_match = bool(text) and pattern[0] in {text[0], '.'}
if len(pattern) >= 2 and pattern[1] == '*':
return (isMatch(text, pattern[2:]) or
(first_match and isMatch(text[1:], pattern)))
else:
return first_match and isMatch(text[1:], pattern[1:])
2. 动态规划优化
def isMatchDP(text: str, pattern: str) -> bool:
"""使用动态规划的正则表达式匹配"""
dp = [[False] * (len(pattern) + 1) for _ in range(len(text) + 1)]
dp[-1][-1] = True
for i in range(len(text), -1, -1):
for j in range(len(pattern) - 1, -1, -1):
first_match = i < len(text) and pattern[j] in {text[i], '.'}
if j+1 < len(pattern) and pattern[j+1] == '*':
dp[i][j] = dp[i][j+2] or (first_match and dp[i+1][j])
else:
dp[i][j] = first_match and dp[i+1][j+1]
return dp[0][0]
字符串哈希 String Hashing
1. 单哈希函数
def stringHash(s: str) -> int:
"""计算字符串的哈希值"""
base = 131
mod = 1000000007
hash_value = 0
for c in s:
hash_value = (hash_value * base + ord(c)) % mod
return hash_value
2. 双哈希函数(防碰撞)
def doubleHash(s: str) -> Tuple[int, int]:
"""使用两个不同的哈希函数计算哈希值"""
base1, base2 = 131, 13331
mod1, mod2 = 1000000007, 1000000009
hash1 = hash2 = 0
for c in s:
hash1 = (hash1 * base1 + ord(c)) % mod1
hash2 = (hash2 * base2 + ord(c)) % mod2
return (hash1, hash2)
后缀数组与树 Suffix Array and Tree
1. 后缀数组构建
def buildSuffixArray(s: str) -> List[int]:
"""构建后缀数组(简化版)"""
suffixes = [(s[i:], i) for i in range(len(s))]
suffixes.sort()
return [index for _, index in suffixes]
2. 最长公共前缀数组
def buildLCP(s: str, suffix_array: List[int]) -> List[int]:
"""构建最长公共前缀数组"""
n = len(s)
rank = [0] * n
for i in range(n):
rank[suffix_array[i]] = i
lcp = [0] * (n-1)
k = 0
for i in range(n):
if rank[i] == n-1:
k = 0
continue
j = suffix_array[rank[i] + 1]
while i+k < n and j+k < n and s[i+k] == s[j+k]:
k += 1
lcp[rank[i]] = k
if k > 0:
k -= 1
return lcp
经典题目 Classic Problems
1. 最长回文子串 (5. Longest Palindromic Substring)
def longestPalindrome(s: str) -> str:
"""Manacher算法求最长回文子串"""
# 预处理字符串
t = '#' + '#'.join(s) + '#'
n = len(t)
p = [0] * n # 回文半径数组
center = right = 0
max_len = max_center = 0
for i in range(n):
if i < right:
p[i] = min(right - i, p[2 * center - i])
# 中心扩展
while i-p[i]-1 >= 0 and i+p[i]+1 < n and t[i-p[i]-1] == t[i+p[i]+1]:
p[i] += 1
# 更新中心和右边界
if i + p[i] > right:
center, right = i, i + p[i]
# 更新最长回文子串
if p[i] > max_len:
max_len = p[i]
max_center = i
start = (max_center - max_len) // 2
return s[start:start + max_len]
2. 字符串编辑距离 (72. Edit Distance)
def minDistance(word1: str, word2: str) -> int:
"""计算两个字符串的编辑距离"""
m, n = len(word1), len(word2)
dp = [[0] * (n + 1) for _ in range(m + 1)]
# 初始化边界
for i in range(m + 1):
dp[i][0] = i
for j in range(n + 1):
dp[0][j] = j
# 动态规划填表
for i in range(1, m + 1):
for j in range(1, n + 1):
if word1[i-1] == word2[j-1]:
dp[i][j] = dp[i-1][j-1]
else:
dp[i][j] = min(dp[i-1][j], dp[i][j-1], dp[i-1][j-1]) + 1
return dp[m][n]
实践建议 Practice Tips
-
字符串匹配算法选择
- KMP:精确匹配,线性时间复杂度
- Rabin-Karp:多模式匹配
- Boyer-Moore:实际应用中较快
-
正则表达式
- 理解基本语法
- 使用动态规划优化
- 注意特殊字符处理
-
哈希技巧
- 选择合适的基数和模数
- 考虑使用双哈希防碰撞
- 处理溢出问题
-
后缀结构
- 理解后缀数组和树的特点
- 掌握常见应用场景
- 注意空间效率
常见陷阱 Common Pitfalls
-
边界处理
- 空字符串
- 单字符串
- 特殊字符
-
效率问题
- 字符串拼接效率
- 子串生成效率
- 空间使用效率
-
编码问题
- 字符集处理
- Unicode处理
- 大小写敏感性
-
算法选择
- 场景不适合的算法
- 复杂度过高的实现
- 内存使用过大