fix: domain match

This commit is contained in:
mzz2017
2023-02-19 14:08:13 +08:00
parent 0d9892fff2
commit a011c2a74c
10 changed files with 411 additions and 63 deletions

View File

@ -7,10 +7,9 @@ package domain_matcher
import (
"fmt"
"github.com/openacid/slim/encode"
"github.com/openacid/slim/trie"
"github.com/v2rayA/ahocorasick-domain"
"github.com/v2rayA/dae/common/consts"
"github.com/v2rayA/dae/pkg/trie"
"regexp"
"sort"
"strings"
@ -21,7 +20,7 @@ type AhocorasickSlimtrie struct {
validTrieIndexes []int
validRegexpIndexes []int
ac []*ahocorasick.Matcher
trie []*trie.SlimTrie
trie []*trie.Trie
regexp [][]*regexp.Regexp
toBuildAc [][][]byte
@ -32,7 +31,7 @@ type AhocorasickSlimtrie struct {
func NewAhocorasickSlimtrie(bitLength int) *AhocorasickSlimtrie {
return &AhocorasickSlimtrie{
ac: make([]*ahocorasick.Matcher, bitLength),
trie: make([]*trie.SlimTrie, bitLength),
trie: make([]*trie.Trie, bitLength),
regexp: make([][]*regexp.Regexp, bitLength),
toBuildAc: make([][][]byte, bitLength),
toBuildTrie: make([][]string, bitLength),
@ -86,8 +85,7 @@ func (n *AhocorasickSlimtrie) MatchDomainBitmap(domain string) (bitmap []uint32)
N++
}
bitmap = make([]uint32, N)
// Add magic chars as head and tail.
domain = "^" + strings.ToLower(strings.TrimSuffix(domain, ".")) + "$"
domain = strings.ToLower(strings.TrimSuffix(domain, "."))
// Domain should consist of 'a'-'z' and '.' and '-'
for _, b := range []byte(domain) {
if !ahocorasick.IsValidChar(b) {
@ -95,23 +93,25 @@ func (n *AhocorasickSlimtrie) MatchDomainBitmap(domain string) (bitmap []uint32)
}
}
// Suffix matching.
suffixTrieDomain := ToSuffixTrieString(domain)
suffixTrieDomain := ToSuffixTrieString("^" + domain)
for _, i := range n.validTrieIndexes {
if bitmap[i/32]&(1<<(i%32)) > 0 {
// Already matched.
continue
}
if _, ok := n.trie[i].Get(suffixTrieDomain); ok {
if n.trie[i].HasPrefix(suffixTrieDomain) {
bitmap[i/32] |= 1 << (i % 32)
}
}
// Keyword matching.
// Add magic chars as head and tail.
acDomain := "^" + domain + "$"
for _, i := range n.validAcIndexes {
if bitmap[i/32]&(1<<(i%32)) > 0 {
// Already matched.
continue
}
if n.ac[i].Contains([]byte(domain)) {
if n.ac[i].Contains([]byte(acDomain)) {
bitmap[i/32] |= 1 << (i % 32)
}
}
@ -167,18 +167,13 @@ func (n *AhocorasickSlimtrie) Build() (err error) {
}
// Build succinct trie.
trueValue := true
for i, toBuild := range n.toBuildTrie {
if len(toBuild) == 0 {
continue
}
toBuild = ToSuffixTrieStrings(toBuild)
sort.Strings(toBuild)
n.trie[i], err = trie.NewSlimTrie(encode.Dummy{}, toBuild, nil, trie.Opt{
DedupValue: &trueValue,
// Set opt to complete to avoid false positive.
Complete: &trueValue,
})
n.trie[i] = trie.NewTrie(toBuild)
if err != nil {
return err
}

View File

@ -0,0 +1,59 @@
/*
* SPDX-License-Identifier: AGPL-3.0-only
* Copyright (c) 2023, v2rayA Organization <team@v2raya.org>
*/
package domain_matcher
import (
"github.com/sirupsen/logrus"
"github.com/v2rayA/dae/common/consts"
"golang.org/x/exp/slices"
"math/rand"
"testing"
)
// TestAhocorasickSlimtrie cross-checks AhocorasickSlimtrie against the
// Bruteforce matcher: both are fed the same domain sets, then queried with
// 10000 pseudo-random domains derived from TestSample. The two bitmaps must be
// identical for every query.
func TestAhocorasickSlimtrie(t *testing.T) {
	logrus.SetLevel(logrus.TraceLevel)
	simulatedDomainSet, err := getDomain()
	if err != nil {
		t.Fatal(err)
	}
	bf := NewBruteforce(consts.MaxMatchSetLen)
	actrie := NewAhocorasickSlimtrie(consts.MaxMatchSetLen)
	for _, domains := range simulatedDomainSet {
		bf.AddSet(domains.RuleIndex, domains.Domains, domains.Key)
		actrie.AddSet(domains.RuleIndex, domains.Domains, domains.Key)
	}
	if err = bf.Build(); err != nil {
		t.Fatal(err)
	}
	if err = actrie.Build(); err != nil {
		t.Fatal(err)
	}

	// Use a private, fixed-seed generator so the sample sequence is
	// reproducible and does not depend on (or disturb) other users of the
	// package-global math/rand source.
	rng := rand.New(rand.NewSource(200))
	for i := 0; i < 10000; i++ {
		sample := TestSample[rng.Intn(len(TestSample))]
		choice := rng.Intn(10)
		switch {
		case choice < 4:
			// 40%: prepend a random label of 0..4 lowercase letters. A
			// zero-length label yields a leading ".", which is kept on
			// purpose as an edge case.
			label := make([]byte, rng.Intn(5))
			for j := range label {
				// +1 so that 'z' itself can be generated.
				label[j] = 'a' + byte(rng.Intn('z'-'a'+1))
			}
			sample = string(label) + "." + sample
		case choice < 6:
			// 20%: cut a random-length prefix off the sample.
			// rng.Intn panics on a non-positive argument, so skip empty
			// samples instead of crashing the test.
			if len(sample) > 0 {
				sample = sample[rng.Intn(len(sample)):]
			}
		default:
			// 40%: use the sample unchanged.
		}
		bitmap := bf.MatchDomainBitmap(sample)
		bitmap2 := actrie.MatchDomainBitmap(sample)
		if !slices.Equal(bitmap, bitmap2) {
			t.Fatalf("iteration %d: domain %q: bruteforce bitmap %v != ahocorasick-slimtrie bitmap %v",
				i, sample, bitmap, bitmap2)
		}
	}
}

View File

@ -172,7 +172,13 @@ func BenchmarkBruteforce(b *testing.B) {
if err != nil {
b.Fatal(err)
}
bf := NewBruteforce(simulatedDomainSet)
bf := NewBruteforce(consts.MaxMatchSetLen)
for _, domains := range simulatedDomainSet {
bf.AddSet(domains.RuleIndex, domains.Domains, domains.Key)
}
if err = bf.Build(); err != nil {
b.Fatal(err)
}
b.StartTimer()
runBenchmark(b, bf)
}

View File

@ -6,6 +6,7 @@
package domain_matcher
import (
"fmt"
"github.com/v2rayA/dae/common/consts"
"github.com/v2rayA/dae/component/routing"
"regexp"
@ -17,18 +18,31 @@ type Bruteforce struct {
err error
}
func NewBruteforce(simulatedDomainSet []routing.DomainSet) *Bruteforce {
func NewBruteforce(bitLength int) *Bruteforce {
return &Bruteforce{
simulatedDomainSet: simulatedDomainSet,
simulatedDomainSet: make([]routing.DomainSet, bitLength),
}
}
// AddSet stores patterns of domain-key type typ in the slot for bitIndex.
//
// Errors are sticky: once n.err is set, every further call is a no-op. The
// error is recorded rather than returned (presumably so a later step such as
// Build can report it; verify against the rest of the file).
//
// An out-of-range bitIndex is recorded as an error instead of panicking on
// the slice index. A slot counts as occupied only when its Domains slice is
// non-empty, so a previous AddSet with no patterns is not treated as a
// duplicate.
func (n *Bruteforce) AddSet(bitIndex int, patterns []string, typ consts.RoutingDomainKey) {
	if n.err != nil {
		return
	}
	if bitIndex < 0 || bitIndex >= len(n.simulatedDomainSet) {
		n.err = fmt.Errorf("RuleIndex %v out of range [0, %v)", bitIndex, len(n.simulatedDomainSet))
		return
	}
	if len(n.simulatedDomainSet[bitIndex].Domains) != 0 {
		n.err = fmt.Errorf("duplicated RuleIndex: %v", bitIndex)
		return
	}
	n.simulatedDomainSet[bitIndex] = routing.DomainSet{
		Key:       typ,
		RuleIndex: bitIndex,
		Domains:   patterns,
	}
}
func (n *Bruteforce) MatchDomainBitmap(domain string) (bitmap []uint32) {
N := len(n.simulatedDomainSet) / 32
if len(n.simulatedDomainSet)%32 != 0 {
N++
}
domain = strings.ToLower(strings.TrimSuffix(domain, "."))
bitmap = make([]uint32, N)
for _, s := range n.simulatedDomainSet {
for _, d := range s.Domains {
@ -52,6 +66,7 @@ func (n *Bruteforce) MatchDomainBitmap(domain string) (bitmap []uint32) {
}
}
if hit {
//logrus.Traceln(d, s.Key, "matched given", domain)
bitmap[s.RuleIndex/32] |= 1 << (s.RuleIndex % 32)
break
}