FST #
FST是一种有限状态自动机,可以用来存储和快速查找字符串集合。
FST有两种类型:前缀FST和后缀FST,它们分别用于存储前缀和后缀字符串集合。在URL去重中,可以使用前缀FST来存储已经爬取过的URL。
Golang #
实现FST前缀树(Trie)
package main
import (
"fmt"
"sync"
)
type Node struct {
isWord bool
next map[rune]*Node
prev map[rune]*Node
}
type Trie struct {
root *Node
mu sync.RWMutex
}
func NewTrie() *Trie {
return &Trie{
root: &Node{next: make(map[rune]*Node),
prev: make(map[rune]*Node),
},
}
}
func (t *Trie) Insert(word string) {
t.mu.Lock()
defer t.mu.Unlock()
node := t.root
for _, ch := range word {
if node.next[ch] == nil {
node.next[ch] = &Node{
next: make(map[rune]*Node),
prev: make(map[rune]*Node),
}
}
node = node.next[ch]
}
node.isWord = true
node = t.root
for i := len(word) - 1; i >= 0; i-- {
ch := rune(word[i])
if node.prev[ch] == nil {
node.prev[ch] = &Node{
next: make(map[rune]*Node),
prev: make(map[rune]*Node),
}
}
node = node.prev[ch]
}
node.isWord = true
}
func (t *Trie) SearchPrefix(prefix string) bool {
t.mu.RLock()
defer t.mu.RUnlock()
node := t.root
for _, ch := range prefix {
node = node.next[ch]
if node == nil {
return false
}
}
return true
}
func (t *Trie) SearchSuffix(suffix string) bool {
t.mu.RLock()
defer t.mu.RUnlock()
node := t.root
for i := len(suffix) - 1; i >= 0; i-- {
ch := rune(suffix[i])
node = node.prev[ch]
if node == nil {
return false
}
}
return true
}
func main() {
trie := NewTrie()
trie.Insert("https://www.example.com/")
fmt.Println(trie.SearchPrefix("https://www.example.com/about")) // false
fmt.Println(trie.SearchPrefix("https://www.example.com")) // true
fmt.Println(trie.SearchSuffix("https://www.example.com/post")) // false
fmt.Println(trie.SearchPrefix("https://www.example1.com/")) // false
fmt.Println(trie.SearchSuffix("https://www.example.com/")) // true
}
Rust #
定义简单的Node结构表示FST节点,使用HashMap存储节点。(示例为全匹配)
use std::collections::HashMap;
#[derive(Debug)]
struct Node {
is_end: bool,
children: HashMap<char, Node>,
}
impl Node {
fn new() -> Self {
Node {
is_end: false,
children: HashMap::new(),
}
}
}
#[derive(Debug)]
pub struct FST {
root: Node,
}
impl FST {
pub fn new() -> Self {
FST { root: Node::new() }
}
pub fn insert(&mut self, word: &str) {
let mut current = &mut self.root;
for ch in word.chars() {
current = current.children.entry(ch).or_insert(Node::new());
}
current.is_end = true;
}
pub fn search(&self, word: &str) -> bool {
let mut current = &self.root;
for ch in word.chars() {
match current.children.get(&ch) {
Some(next) => current = next,
None => return false,
}
}
current.is_end
}
}
pub fn fst_test() {
let mut fst = FST::new();
fst.insert("hello");
fst.insert("world");
println!("{:?}", fst);
println!("{}", fst.search("hello1"));
println!("{}", fst.search("world"));
println!("{}", fst.search("hell"));
println!("{}", fst.search("worl"));
}