mirror of
https://github.com/go-gitea/chardet.git
synced 2026-07-01 20:24:11 -04:00
Add logic to sort and dedup recognizer outputs
This commit is contained in:
+31
-6
@@ -2,6 +2,7 @@ package chardet
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"sort"
|
||||
)
|
||||
|
||||
type Result struct {
|
||||
@@ -34,10 +35,6 @@ func (d *Detector) DetectBest(b []byte, stripTag bool, declaredCharset string) (
|
||||
return
|
||||
}
|
||||
|
||||
// matchHelper calls r.Match with input and sends the returned
// recognizerOutput on outputChan.
// NOTE(review): DetectAll creates outputChan with no buffer, so this send
// presumably blocks until DetectAll receives; the call site is not visible
// here, confirm how matchHelper is started.
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
|
||||
outputChan <- r.Match(input)
|
||||
}
|
||||
|
||||
func (d *Detector) DetectAll(b []byte, stripTag bool, declaredCharset string) ([]Result, error) {
|
||||
input := newRecognizerInput(b, stripTag, declaredCharset)
|
||||
outputChan := make(chan recognizerOutput)
|
||||
@@ -46,7 +43,35 @@ func (d *Detector) DetectAll(b []byte, stripTag bool, declaredCharset string) ([
|
||||
}
|
||||
outputs := make([]recognizerOutput, 0, len(recognizers))
|
||||
for i := 0; i < len(recognizers); i++ {
|
||||
outputs = append(outputs, <-outputChan)
|
||||
o := <-outputChan
|
||||
if o.Confidence > 0 {
|
||||
outputs = append(outputs, <-outputChan)
|
||||
}
|
||||
}
|
||||
return nil, NotDetectedError
|
||||
if len(outputs) == 0 {
|
||||
return nil, NotDetectedError
|
||||
}
|
||||
|
||||
sort.Sort(recognizerOutputs(outputs))
|
||||
dedupOutputs := make([]Result, 0, len(outputs))
|
||||
foundCharsets := make(map[string]struct{}, len(outputs))
|
||||
for _, o := range outputs {
|
||||
if _, found := foundCharsets[o.Charset]; !found {
|
||||
dedupOutputs = append(dedupOutputs, Result(o))
|
||||
foundCharsets[o.Charset] = struct{}{}
|
||||
}
|
||||
}
|
||||
if len(dedupOutputs) == 0 {
|
||||
return nil, NotDetectedError
|
||||
}
|
||||
return dedupOutputs, nil
|
||||
}
|
||||
|
||||
// matchHelper calls r.Match with input and sends the returned
// recognizerOutput on outputChan.
// NOTE(review): DetectAll creates outputChan with no buffer, so this send
// presumably blocks until DetectAll receives; the call site is not visible
// here, confirm how matchHelper is started.
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
|
||||
outputChan <- r.Match(input)
|
||||
}
|
||||
|
||||
// recognizerOutputs is a slice of recognizerOutput with Len, Less and Swap
// methods so that DetectAll can pass it to sort.Sort. Less ranks a larger
// Confidence first, so sorting yields descending confidence.
type recognizerOutputs []recognizerOutput
|
||||
// Len reports how many outputs the slice holds.
func (r recognizerOutputs) Len() int {
	return len(r)
}
|
||||
// Less reports whether element i must come before element j. The element
// with the larger Confidence is ordered first; equal confidences are not
// ordered relative to each other.
func (r recognizerOutputs) Less(i, j int) bool {
	return r[j].Confidence < r[i].Confidence
}
|
||||
// Swap exchanges the elements at positions i and j.
func (r recognizerOutputs) Swap(i, j int) {
	held := r[i]
	r[i] = r[j]
	r[j] = held
}
|
||||
|
||||
+1
-5
@@ -4,11 +4,7 @@ type recognizer interface {
|
||||
Match(*recognizerInput) recognizerOutput
|
||||
}
|
||||
|
||||
// recognizerOutput is the value returned by recognizer.Match.
// DetectAll keeps only outputs whose Confidence is greater than zero and
// deduplicates them by Charset.
type recognizerOutput struct {
|
||||
Charset string
|
||||
Language string
|
||||
Confidence uint32
|
||||
}
|
||||
// recognizerOutput is the value returned by recognizer.Match. It is defined
// in terms of Result so the two types convert directly, as DetectAll does
// with Result(o).
type recognizerOutput Result
|
||||
|
||||
type recognizerInput struct {
|
||||
raw []byte
|
||||
|
||||
Reference in New Issue
Block a user