optimize: memory occupation

This commit is contained in:
mzz2017
2023-02-18 21:39:46 +08:00
parent 5806f9bb39
commit b277212715
6 changed files with 239 additions and 132 deletions

View File

@ -1,125 +0,0 @@
/*
* SPDX-License-Identifier: AGPL-3.0-only
* Copyright (c) 2023, v2rayA Organization <team@v2raya.org>
*/
package domain_matcher
import (
"fmt"
"github.com/cloudflare/ahocorasick"
"github.com/v2rayA/dae/common/consts"
"regexp"
"strings"
)
// Ahocorasick matches domains against per-rule pattern sets. Every table that
// NewAhocorasick sizes is indexed by the bit index passed to AddSet.
type Ahocorasick struct {
	// matchers holds one automaton per bit index; slots stay nil until Build
	// creates them, and only for indexes that received patterns.
	matchers []*ahocorasick.Matcher
	// validIndexes lists the bit indexes for which Build created a matcher.
	validIndexes []int

	// regexp holds the compiled regular expressions added per bit index.
	regexp [][]*regexp.Regexp
	// validRegexpIndexes lists the bit indexes that own at least one regexp.
	validRegexpIndexes []int

	// toBuild accumulates the raw patterns per bit index; Build sets it to nil.
	toBuild [][][]byte

	// err is the first error recorded by AddSet; Build returns it.
	err error
}
// NewAhocorasick returns an empty matcher whose per-rule tables each have
// bitLength slots, one per bit index that may later be passed to AddSet.
func NewAhocorasick(bitLength int) *Ahocorasick {
	m := new(Ahocorasick)
	m.matchers = make([]*ahocorasick.Matcher, bitLength)
	m.toBuild = make([][][]byte, bitLength)
	m.regexp = make([][]*regexp.Regexp, bitLength)
	return m
}
// AddSet queues patterns of the given kind for the rule at bitIndex. Nothing
// becomes matchable until Build is called.
//
// Non-regex patterns are stored with the magic characters "^" (start of
// domain) and "$" (end of domain) so that a plain substring automaton can
// express full and suffix matches. Regex patterns are compiled immediately.
//
// AddSet does not return an error: the first failure is remembered, every
// later call becomes a no-op, and Build reports the remembered error.
func (n *Ahocorasick) AddSet(bitIndex int, patterns []string, typ consts.RoutingDomainKey) {
	if n.err != nil {
		return
	}
	switch typ {
	case consts.RoutingDomainKey_Full:
		// The whole domain must equal d.
		for _, d := range patterns {
			n.toBuild[bitIndex] = append(n.toBuild[bitIndex], []byte("^"+d+"$"))
		}
	case consts.RoutingDomainKey_Suffix:
		for _, d := range patterns {
			if strings.HasPrefix(d, ".") {
				// ".example.com" matches abc.example.com only.
				n.toBuild[bitIndex] = append(n.toBuild[bitIndex], []byte(d+"$"))
			} else {
				// "example.com" matches xxx.example.com ...
				n.toBuild[bitIndex] = append(n.toBuild[bitIndex], []byte("."+d+"$"))
				// ... and example.com itself.
				n.toBuild[bitIndex] = append(n.toBuild[bitIndex], []byte("^"+d+"$"))
			}
		}
	case consts.RoutingDomainKey_Keyword:
		// Plain substring, no anchoring.
		for _, d := range patterns {
			n.toBuild[bitIndex] = append(n.toBuild[bitIndex], []byte(d))
		}
	case consts.RoutingDomainKey_Regex:
		for _, d := range patterns {
			r, err := regexp.Compile(d)
			if err != nil {
				// Keep the compile error so the user can see what is wrong
				// with the pattern, not just which pattern it was.
				n.err = fmt.Errorf("failed to compile regex: %v: %w", d, err)
				return
			}
			n.regexp[bitIndex] = append(n.regexp[bitIndex], r)
		}
	default:
		n.err = fmt.Errorf("unknown RoutingDomainKey: %v", typ)
		return
	}
}
// MatchDomainBitmap reports which rules match domain. Bit i of the returned
// bitmap (word i/32, bit i%32) is set when the rule added at bit index i
// matched. The bitmap always has enough words for every rule slot, and is all
// zero when domain contains one of the reserved characters "^" or "$".
//
// A single trailing "." is removed and the domain is lower-cased before
// matching.
func (n *Ahocorasick) MatchDomainBitmap(domain string) (bitmap []uint32) {
	// One bit per rule slot, rounded up to whole 32-bit words.
	bitmap = make([]uint32, (len(n.matchers)+31)/32)
	// Domain should not contain ^ or $: they are reserved as magic chars.
	if strings.ContainsAny(domain, "^$") {
		return bitmap
	}
	plain := strings.ToLower(strings.TrimSuffix(domain, "."))
	// Add magic chars as head and tail. The conversion is done once here
	// instead of once per matcher.
	// NOTE(review): assumes MatchThreadSafe does not modify its input — confirm.
	anchored := []byte("^" + plain + "$")
	for _, i := range n.validIndexes {
		if hits := n.matchers[i].MatchThreadSafe(anchored); len(hits) > 0 {
			bitmap[i/32] |= 1 << (i % 32)
		}
	}
	// Regex matching is independent of the magic chars: it runs against the
	// plain domain, otherwise a user regex anchored with ^ or $ could never
	// match because the text would start with a literal "^" and end with "$".
	for _, i := range n.validRegexpIndexes {
		if bitmap[i/32]&(1<<(i%32)) > 0 {
			// Already matched.
			continue
		}
		for _, r := range n.regexp[i] {
			if r.MatchString(plain) {
				bitmap[i/32] |= 1 << (i % 32)
				break
			}
		}
	}
	return bitmap
}
// Build turns the patterns queued by AddSet into matchers. It returns the
// error recorded by AddSet, if any, without building anything. On success the
// queued raw patterns are dropped.
func (n *Ahocorasick) Build() error {
	if err := n.err; err != nil {
		return err
	}
	n.validIndexes = make([]int, 0, len(n.toBuild)/8)
	for idx := range n.toBuild {
		if patterns := n.toBuild[idx]; len(patterns) > 0 {
			n.matchers[idx] = ahocorasick.NewMatcher(patterns)
			n.validIndexes = append(n.validIndexes, idx)
		}
	}
	for idx, compiled := range n.regexp {
		if len(compiled) > 0 {
			n.validRegexpIndexes = append(n.validRegexpIndexes, idx)
		}
	}
	// The raw patterns are no longer needed once the matchers exist.
	n.toBuild = nil
	return nil
}

View File

@ -0,0 +1,195 @@
/*
* SPDX-License-Identifier: AGPL-3.0-only
* Copyright (c) 2023, v2rayA Organization <team@v2raya.org>
*/
package domain_matcher
import (
"fmt"
"github.com/openacid/slim/encode"
"github.com/openacid/slim/trie"
"github.com/v2rayA/ahocorasick-domain"
"github.com/v2rayA/dae/common/consts"
"regexp"
"sort"
"strings"
)
// AhocorasickSuccinctset matches domains against per-rule pattern sets using
// three structures: a trie for full/suffix patterns, an Aho-Corasick matcher
// for keyword patterns, and compiled regular expressions. Every table that
// NewAhocorasickSuccinctset sizes is indexed by the bit index given to AddSet.
type AhocorasickSuccinctset struct {
	// Keyword patterns: queued by AddSet, turned into matchers by Build.
	toBuildAc      [][][]byte
	ac             []*ahocorasick.Matcher
	validAcIndexes []int

	// Full and suffix patterns: queued by AddSet, turned into tries by Build.
	toBuildTrie      [][]string
	trie             []*trie.SlimTrie
	validTrieIndexes []int

	// Regex patterns: compiled by AddSet, indexed by Build.
	regexp             [][]*regexp.Regexp
	validRegexpIndexes []int

	// err is the first error recorded by AddSet; Build returns it.
	err error
}
// NewAhocorasickSuccinctset returns an empty matcher whose per-rule tables
// each have bitLength slots, one per bit index that may be passed to AddSet.
func NewAhocorasickSuccinctset(bitLength int) *AhocorasickSuccinctset {
	m := new(AhocorasickSuccinctset)
	m.ac = make([]*ahocorasick.Matcher, bitLength)
	m.trie = make([]*trie.SlimTrie, bitLength)
	m.regexp = make([][]*regexp.Regexp, bitLength)
	m.toBuildAc = make([][][]byte, bitLength)
	m.toBuildTrie = make([][]string, bitLength)
	return m
}
// AddSet queues patterns of the given kind for the rule at bitIndex. Nothing
// becomes matchable until Build is called.
//
// Full and suffix patterns are queued for the trie, decorated with the magic
// characters "^" (start of domain) and "$" (end of domain). Keyword patterns
// are queued for the Aho-Corasick matcher. Regex patterns are compiled
// immediately.
//
// AddSet does not return an error: the first failure is remembered, every
// later call becomes a no-op, and Build reports the remembered error.
func (n *AhocorasickSuccinctset) AddSet(bitIndex int, patterns []string, typ consts.RoutingDomainKey) {
	if n.err != nil {
		return
	}
	switch typ {
	case consts.RoutingDomainKey_Full:
		// The whole domain must equal d.
		for _, d := range patterns {
			n.toBuildTrie[bitIndex] = append(n.toBuildTrie[bitIndex], "^"+d+"$")
		}
	case consts.RoutingDomainKey_Suffix:
		for _, d := range patterns {
			if strings.HasPrefix(d, ".") {
				// ".example.com" matches abc.example.com
				n.toBuildTrie[bitIndex] = append(n.toBuildTrie[bitIndex], d+"$")
				// but cannot match example.com
			} else {
				// "example.com" matches xxx.example.com
				n.toBuildTrie[bitIndex] = append(n.toBuildTrie[bitIndex], "."+d+"$")
				// and example.com itself
				n.toBuildTrie[bitIndex] = append(n.toBuildTrie[bitIndex], "^"+d+"$")
				// but cannot match abcexample.com
			}
		}
	case consts.RoutingDomainKey_Keyword:
		// Only use ac automaton for "keyword" matching to save memory.
		for _, d := range patterns {
			n.toBuildAc[bitIndex] = append(n.toBuildAc[bitIndex], []byte(d))
		}
	case consts.RoutingDomainKey_Regex:
		for _, d := range patterns {
			r, err := regexp.Compile(d)
			if err != nil {
				// Keep the compile error so the user can see what is wrong
				// with the pattern, not just which pattern it was.
				n.err = fmt.Errorf("failed to compile regex: %v: %w", d, err)
				return
			}
			n.regexp[bitIndex] = append(n.regexp[bitIndex], r)
		}
	default:
		n.err = fmt.Errorf("unknown RoutingDomainKey: %v", typ)
		return
	}
}
// MatchDomainBitmap reports which rules match domain. Bit i of the returned
// bitmap (word i/32, bit i%32) is set when the rule added at bit index i
// matched. The bitmap always has enough words for every rule slot, and is all
// zero when the normalized domain contains a byte that
// ahocorasick.IsValidChar rejects.
//
// A single trailing "." is removed and the domain is lower-cased before
// matching. The trie is consulted first, then the keyword matcher, then the
// regular expressions; a rule that has already matched is not tested again.
func (n *AhocorasickSuccinctset) MatchDomainBitmap(domain string) (bitmap []uint32) {
	// One bit per rule slot, rounded up to whole 32-bit words.
	bitmap = make([]uint32, (len(n.ac)+31)/32)
	plain := strings.ToLower(strings.TrimSuffix(domain, "."))
	// Add magic chars as head and tail.
	anchored := "^" + plain + "$"
	// Converted once and shared by the validity check and every keyword
	// matcher instead of re-allocating per rule.
	// NOTE(review): assumes MatchThreadSafe does not modify its input — confirm.
	anchoredBytes := []byte(anchored)
	// Give up early on bytes the matcher library does not accept.
	for _, b := range anchoredBytes {
		if !ahocorasick.IsValidChar(b) {
			return bitmap
		}
	}
	// Suffix matching.
	suffixTrieDomain := ToSuffixTrieString(anchored)
	for _, i := range n.validTrieIndexes {
		if bitmap[i/32]&(1<<(i%32)) > 0 {
			// Already matched.
			continue
		}
		if _, ok := n.trie[i].Get(suffixTrieDomain); ok {
			bitmap[i/32] |= 1 << (i % 32)
		}
	}
	// Keyword matching.
	for _, i := range n.validAcIndexes {
		if bitmap[i/32]&(1<<(i%32)) > 0 {
			// Already matched.
			continue
		}
		if hits := n.ac[i].MatchThreadSafe(anchoredBytes); len(hits) > 0 {
			bitmap[i/32] |= 1 << (i % 32)
		}
	}
	// Regex matching runs against the plain domain: with the magic chars
	// attached, a user regex anchored with ^ or $ could never match because
	// the text would start with a literal "^" and end with a literal "$".
	for _, i := range n.validRegexpIndexes {
		if bitmap[i/32]&(1<<(i%32)) > 0 {
			// Already matched.
			continue
		}
		for _, r := range n.regexp[i] {
			if r.MatchString(plain) {
				bitmap[i/32] |= 1 << (i % 32)
				break
			}
		}
	}
	return bitmap
}
// ToSuffixTrieString converts a pattern or domain into its trie key: one
// trailing "$" (the end-of-domain magic char) is dropped and the remaining
// bytes are reversed, so that domain suffixes become key prefixes.
func ToSuffixTrieString(s string) string {
	rev := []byte(strings.TrimSuffix(s, "$"))
	// Swap bytes from both ends towards the middle.
	for lo, hi := 0, len(rev)-1; lo < hi; lo, hi = lo+1, hi-1 {
		rev[lo], rev[hi] = rev[hi], rev[lo]
	}
	return string(rev)
}
// ToSuffixTrieStrings applies ToSuffixTrieString to every element of s and
// returns the results in a new slice of the same length; s is not modified.
func ToSuffixTrieStrings(s []string) []string {
	out := make([]string, 0, len(s))
	for _, item := range s {
		out = append(out, ToSuffixTrieString(item))
	}
	return out
}
// Build turns the patterns queued by AddSet into matchers. It returns the
// error recorded by AddSet, if any, without building anything, and otherwise
// the first error reported while constructing a matcher or trie. On success
// the queued raw patterns are dropped.
func (n *AhocorasickSuccinctset) Build() (err error) {
	if n.err != nil {
		return n.err
	}
	n.validAcIndexes = make([]int, 0, len(n.toBuildAc)/8)
	n.validTrieIndexes = make([]int, 0, len(n.toBuildAc)/8)
	n.validRegexpIndexes = make([]int, 0, len(n.toBuildAc)/8)
	// Build AC automaton.
	for i, toBuild := range n.toBuildAc {
		if len(toBuild) == 0 {
			continue
		}
		n.ac[i], err = ahocorasick.NewMatcher(toBuild)
		if err != nil {
			return fmt.Errorf("build keyword matcher for rule %d: %w", i, err)
		}
		n.validAcIndexes = append(n.validAcIndexes, i)
	}
	// Build succinct trie.
	for i, toBuild := range n.toBuildTrie {
		if len(toBuild) == 0 {
			continue
		}
		keys := ToSuffixTrieStrings(toBuild)
		sort.Strings(keys)
		// Drop duplicates: a Full and a Suffix pattern for the same domain
		// both queue "^domain$", and the same pattern may be listed twice.
		// keys is a fresh slice, so compacting it in place is safe.
		// NOTE(review): slim appears to expect strictly ascending keys — confirm.
		uniq := keys[:1]
		for _, k := range keys[1:] {
			if k != uniq[len(uniq)-1] {
				uniq = append(uniq, k)
			}
		}
		n.trie[i], err = trie.NewSlimTrie(encode.I8{}, uniq, nil)
		if err != nil {
			return fmt.Errorf("build suffix trie for rule %d: %w", i, err)
		}
		n.validTrieIndexes = append(n.validTrieIndexes, i)
	}
	// Regexp.
	for i := range n.regexp {
		if len(n.regexp[i]) == 0 {
			continue
		}
		n.validRegexpIndexes = append(n.validRegexpIndexes, i)
	}
	// Release unused data.
	n.toBuildAc = nil
	n.toBuildTrie = nil
	return nil
}

View File

@ -195,14 +195,14 @@ func BenchmarkGoRegexpNfa(b *testing.B) {
runBenchmark(b, nfa)
}
func BenchmarkAhocorasick(b *testing.B) {
func BenchmarkAhocorasickSuccinctset(b *testing.B) {
b.StopTimer()
logrus.SetLevel(logrus.WarnLevel)
simulatedDomainSet, err := getDomain()
if err != nil {
b.Fatal(err)
}
ahocorasick := NewAhocorasick(consts.MaxMatchSetLen)
ahocorasick := NewAhocorasickSuccinctset(consts.MaxMatchSetLen)
for _, domains := range simulatedDomainSet {
ahocorasick.AddSet(domains.RuleIndex, domains.Domains, domains.Key)
}