Skip to main content

Finite State Transducer

Note
KIGA
Author
KIGA
This is a personal blog, intended for sharing.
Table of Contents

FST
#

FST是一种有限状态自动机,可以用来存储和快速查找字符串集合。

FST有两种类型:前缀FST和后缀FST,它们分别用于存储前缀和后缀字符串集合。在URL去重中,可以使用前缀FST来存储已经爬取过的URL。

Golang
#

实现FST前缀树(Trie)

package main

import (
	"fmt"
	"sync"
)

type Node struct {
	isWord bool
	next   map[rune]*Node
	prev   map[rune]*Node
}

type Trie struct {
	root *Node
	mu   sync.RWMutex
}

func NewTrie() *Trie {
	return &Trie{
		root: &Node{next: make(map[rune]*Node),
			prev: make(map[rune]*Node),
		},
	}
}

func (t *Trie) Insert(word string) {
	t.mu.Lock()
	defer t.mu.Unlock()

	node := t.root
	for _, ch := range word {
		if node.next[ch] == nil {
			node.next[ch] = &Node{
				next: make(map[rune]*Node),
				prev: make(map[rune]*Node),
			}
		}
		node = node.next[ch]
	}
	node.isWord = true

	node = t.root
	for i := len(word) - 1; i >= 0; i-- {
		ch := rune(word[i])
		if node.prev[ch] == nil {
			node.prev[ch] = &Node{
				next: make(map[rune]*Node),
				prev: make(map[rune]*Node),
			}
		}
		node = node.prev[ch]
	}
	node.isWord = true
}

func (t *Trie) SearchPrefix(prefix string) bool {
	t.mu.RLock()
	defer t.mu.RUnlock()

	node := t.root
	for _, ch := range prefix {
		node = node.next[ch]
		if node == nil {
			return false
		}
	}
	return true
}

func (t *Trie) SearchSuffix(suffix string) bool {
	t.mu.RLock()
	defer t.mu.RUnlock()

	node := t.root
	for i := len(suffix) - 1; i >= 0; i-- {
		ch := rune(suffix[i])
		node = node.prev[ch]
		if node == nil {
			return false
		}
	}
	return true
}

func main() {
	trie := NewTrie()
	trie.Insert("https://www.example.com/")
	fmt.Println(trie.SearchPrefix("https://www.example.com/about")) // false
	fmt.Println(trie.SearchPrefix("https://www.example.com"))       // true
	fmt.Println(trie.SearchSuffix("https://www.example.com/post"))  // false
	fmt.Println(trie.SearchPrefix("https://www.example1.com/"))     // false
	fmt.Println(trie.SearchSuffix("https://www.example.com/"))      // true
}

Rust
#

定义简单的Node结构表示FST节点,使用HashMap存储节点。(示例为全匹配)

use std::collections::HashMap;

#[derive(Debug)]
struct Node {
    is_end: bool,
    children: HashMap<char, Node>,
}

impl Node {
    fn new() -> Self {
        Node {
            is_end: false,
            children: HashMap::new(),
        }
    }
}

#[derive(Debug)]
pub struct FST {
    root: Node,
}

impl FST {
    pub fn new() -> Self {
        FST { root: Node::new() }
    }

    pub fn insert(&mut self, word: &str) {
        let mut current = &mut self.root;
        for ch in word.chars() {
            current = current.children.entry(ch).or_insert(Node::new());
        }
        current.is_end = true;
    }

    pub fn search(&self, word: &str) -> bool {
        let mut current = &self.root;
        for ch in word.chars() {
            match current.children.get(&ch) {
                Some(next) => current = next,
                None => return false,
            }
        }
        current.is_end
    }
}

pub fn fst_test() {
    let mut fst = FST::new();
    fst.insert("hello");
    fst.insert("world");
    println!("{:?}", fst);
    println!("{}", fst.search("hello1"));
    println!("{}", fst.search("world"));
    println!("{}", fst.search("hell"));
    println!("{}", fst.search("worl"));
}