Skip to main content
  1. Posts/

Finite State Transducer

365 words·2 mins
Note
Table of Contents

FST
#

FST是一种有限状态自动机,可以用来存储和快速查找字符串集合。

FST有两种类型:前缀FST和后缀FST,它们分别用于存储前缀和后缀字符串集合。在URL去重中,可以使用前缀FST来存储已经爬取过的URL。

Golang
#

实现FST前缀树(Trie)

package main

import (
	"fmt"
	"sync"
)

type Node struct {
	isWord bool
	next   map[rune]*Node
	prev   map[rune]*Node
}

type Trie struct {
	root *Node
	mu   sync.RWMutex
}

func NewTrie() *Trie {
	return &Trie{
		root: &Node{next: make(map[rune]*Node),
			prev: make(map[rune]*Node),
		},
	}
}

func (t *Trie) Insert(word string) {
	t.mu.Lock()
	defer t.mu.Unlock()

	node := t.root
	for _, ch := range word {
		if node.next[ch] == nil {
			node.next[ch] = &Node{
				next: make(map[rune]*Node),
				prev: make(map[rune]*Node),
			}
		}
		node = node.next[ch]
	}
	node.isWord = true

	node = t.root
	for i := len(word) - 1; i >= 0; i-- {
		ch := rune(word[i])
		if node.prev[ch] == nil {
			node.prev[ch] = &Node{
				next: make(map[rune]*Node),
				prev: make(map[rune]*Node),
			}
		}
		node = node.prev[ch]
	}
	node.isWord = true
}

func (t *Trie) SearchPrefix(prefix string) bool {
	t.mu.RLock()
	defer t.mu.RUnlock()

	node := t.root
	for _, ch := range prefix {
		node = node.next[ch]
		if node == nil {
			return false
		}
	}
	return true
}

func (t *Trie) SearchSuffix(suffix string) bool {
	t.mu.RLock()
	defer t.mu.RUnlock()

	node := t.root
	for i := len(suffix) - 1; i >= 0; i-- {
		ch := rune(suffix[i])
		node = node.prev[ch]
		if node == nil {
			return false
		}
	}
	return true
}

func main() {
	trie := NewTrie()
	trie.Insert("https://www.example.com/")
	fmt.Println(trie.SearchPrefix("https://www.example.com/about")) // false
	fmt.Println(trie.SearchPrefix("https://www.example.com"))       // true
	fmt.Println(trie.SearchSuffix("https://www.example.com/post"))  // false
	fmt.Println(trie.SearchPrefix("https://www.example1.com/"))     // false
	fmt.Println(trie.SearchSuffix("https://www.example.com/"))      // true
}

Rust
#

定义简单的Node结构表示FST节点,使用HashMap存储节点。(示例为全匹配)

use std::collections::HashMap;

#[derive(Debug)]
struct Node {
    is_end: bool,
    children: HashMap<char, Node>,
}

impl Node {
    fn new() -> Self {
        Node {
            is_end: false,
            children: HashMap::new(),
        }
    }
}

#[derive(Debug)]
pub struct FST {
    root: Node,
}

impl FST {
    pub fn new() -> Self {
        FST { root: Node::new() }
    }

    pub fn insert(&mut self, word: &str) {
        let mut current = &mut self.root;
        for ch in word.chars() {
            current = current.children.entry(ch).or_insert(Node::new());
        }
        current.is_end = true;
    }

    pub fn search(&self, word: &str) -> bool {
        let mut current = &self.root;
        for ch in word.chars() {
            match current.children.get(&ch) {
                Some(next) => current = next,
                None => return false,
            }
        }
        current.is_end
    }
}

pub fn fst_test() {
    let mut fst = FST::new();
    fst.insert("hello");
    fst.insert("world");
    println!("{:?}", fst);
    println!("{}", fst.search("hello1"));
    println!("{}", fst.search("world"));
    println!("{}", fst.search("hell"));
    println!("{}", fst.search("worl"));
}

Related

How Networks work 笔记
119 words·1 min
Book Note
《How Networks work》 # 第一章 浏览器生成消息 # 浏览器的第一步工作就是对URL进行解析
简单实现QPS计算
280 words·2 mins
Golang
Simple QPS # 实现一个简单的QPS(每秒查询率)测试。
Rust Book Datastructure - Arc
1242 words·6 mins
Rust
Rc 和 Arc 源码对比 # // Rc源码片段 impl<T: ?
Rust Book Datastructure - Rc
1236 words·6 mins
Rust
RC # use std::rc::Rc; use std::sync::Arc; use std::thread; fn main() { let a = Rc::new(String::from("test ref counting")); println!
Rust Book Datastructure - life-time
1450 words·7 mins
Rust
生命周期约束 HRTB # struct ImportantExcerpt<'a> { part: &'a str, } // 将 &'a 类型的生命周期强行转换为 &'b 类型,会报错,只有在 'a >= 'b 的情况 // 下, 'a 才能转换成 'b impl<'a: 'b, 'b> ImportantExcerpt<'a> { fn announce_and_return_part(&'a self, announcement: &'b str) -> &'b str { println!