Add logic to sort and dedup recognizer outputs

This commit is contained in:
Sheng Yu
2012-08-12 15:49:46 -07:00
parent a3955fbed8
commit 7bb0e42ca0
2 changed files with 32 additions and 11 deletions
+31 -6
View File
@@ -2,6 +2,7 @@ package chardet
import (
"errors"
"sort"
)
type Result struct {
@@ -34,10 +35,6 @@ func (d *Detector) DetectBest(b []byte, stripTag bool, declaredCharset string) (
return
}
// matchHelper runs r against input and sends the resulting output on
// outputChan.
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
	result := r.Match(input)
	outputChan <- result
}
func (d *Detector) DetectAll(b []byte, stripTag bool, declaredCharset string) ([]Result, error) {
input := newRecognizerInput(b, stripTag, declaredCharset)
outputChan := make(chan recognizerOutput)
@@ -46,7 +43,35 @@ func (d *Detector) DetectAll(b []byte, stripTag bool, declaredCharset string) ([
}
outputs := make([]recognizerOutput, 0, len(recognizers))
for i := 0; i < len(recognizers); i++ {
outputs = append(outputs, <-outputChan)
o := <-outputChan
if o.Confidence > 0 {
	// Append the value just received. Receiving from outputChan a second
	// time here would discard o and take an extra value off the channel,
	// beyond the single receive per iteration that the loop counts on.
	outputs = append(outputs, o)
}
}
return nil, NotDetectedError
if len(outputs) == 0 {
return nil, NotDetectedError
}
sort.Sort(recognizerOutputs(outputs))
dedupOutputs := make([]Result, 0, len(outputs))
foundCharsets := make(map[string]struct{}, len(outputs))
for _, o := range outputs {
if _, found := foundCharsets[o.Charset]; !found {
dedupOutputs = append(dedupOutputs, Result(o))
foundCharsets[o.Charset] = struct{}{}
}
}
if len(dedupOutputs) == 0 {
return nil, NotDetectedError
}
return dedupOutputs, nil
}
// matchHelper runs r against input and sends the resulting output on
// outputChan.
func matchHelper(r recognizer, input *recognizerInput, outputChan chan<- recognizerOutput) {
	result := r.Match(input)
	outputChan <- result
}
// recognizerOutputs adapts a slice of recognizerOutput to sort.Interface,
// ordering elements by descending Confidence.
type recognizerOutputs []recognizerOutput

// Len reports the number of outputs in the slice.
func (outs recognizerOutputs) Len() int {
	return len(outs)
}

// Less reports whether the output at i has a strictly higher Confidence than
// the output at j, which places the most confident outputs first.
func (outs recognizerOutputs) Less(i, j int) bool {
	return outs[j].Confidence < outs[i].Confidence
}

// Swap exchanges the outputs at positions i and j.
func (outs recognizerOutputs) Swap(i, j int) {
	tmp := outs[i]
	outs[i] = outs[j]
	outs[j] = tmp
}
+1 -5
View File
@@ -4,11 +4,7 @@ type recognizer interface {
Match(*recognizerInput) recognizerOutput
}
// recognizerOutput is the value a recognizer returns from Match.
type recognizerOutput struct {
// Charset names the character set this output refers to.
Charset string
Language string
// Confidence is an unsigned score attached to the output.
// NOTE(review): the valid range is not visible here — confirm.
Confidence uint32
}
// recognizerOutput is the value a recognizer returns from Match. It has the
// same underlying type as Result, so an output can be converted directly with
// Result(o).
type recognizerOutput Result
type recognizerInput struct {
raw []byte