Add detector_test and fix bugs found by it

This commit is contained in:
Sheng Yu
2012-08-14 23:18:41 -07:00
parent f39cc110e6
commit b1ae293603
3 changed files with 29 additions and 7 deletions
+22
View File
@@ -0,0 +1,22 @@
package chardet
import (
"testing"
)
// TestDetector walks a table of embedded sample files and hands each one,
// together with the charset and language listed for it, to the tester's
// ExpectBest check.
func TestDetector(t *testing.T) {
	// sample ties an embedded fixture name to the charset and language
	// passed to ExpectBest for it.
	type sample struct {
		File, Charset, Language string
	}

	samples := []sample{
		{File: "utf8.txt", Charset: "UTF-8", Language: ""},
		{File: "big5.txt", Charset: "Big5", Language: "zh"},
		{File: "shift_jis.txt", Charset: "Shift_JIS", Language: "ja"},
		{File: "gb18030.txt", Charset: "GB-18030", Language: "zh"},
	}

	tester := newChardetTester()
	for i := range samples {
		s := samples[i]
		tester.ExpectBest(embeddedfiles[s.File], s.Charset, s.Language, t)
	}
}
+1 -1
View File
@@ -78,7 +78,7 @@ func (r *recognizerMultiByte) matchConfidence(input *recognizerInput) int {
func binarySearch(l []uint16, c uint16) bool {
start := 0
end := len(l)
end := len(l) - 1
for start <= end {
mid := (start + end) / 2
if c == l[mid] {
+6 -6
View File
@@ -48,15 +48,15 @@ func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput
// recognizerUtf32 holds the per-byte-order pieces used by the UTF-32
// recognizer: a name, a bom prefix, and a function that decodes one unit.
// NOTE(review): decodeChar is listed twice, which looks like a rendered diff
// hunk showing the removed rune-returning field followed by its uint32
// replacement — confirm against the actual file.
type recognizerUtf32 struct {
name string
// bom is the prefix Match compares against the raw input with bytes.HasPrefix.
bom []byte
// decodeChar is called by Match on slices holding at least four bytes.
decodeChar func(input []byte) rune
decodeChar func(input []byte) uint32
}
// decodeUtf32be combines input[0] through input[3] into a single value with
// input[0] in the most significant position. It indexes input[3], so input
// must hold at least four bytes.
// NOTE(review): two signature/return pairs appear here, which looks like a
// diff hunk showing the old line ahead of its replacement — confirm.
// In the rune variant each shift is evaluated on a byte operand, so the
// shifted terms become zero and only input[3] survives; the uint32
// variant widens every byte before shifting.
func decodeUtf32be(input []byte) rune {
return rune(input[0]<<24 | input[1]<<16 | input[2]<<8 | input[3])
func decodeUtf32be(input []byte) uint32 {
return uint32(input[0])<<24 | uint32(input[1])<<16 | uint32(input[2])<<8 | uint32(input[3])
}
// decodeUtf32le combines input[0] through input[3] into a single value with
// input[3] in the most significant position. It indexes input[3], so input
// must hold at least four bytes.
// NOTE(review): two signature/return pairs appear here, which looks like a
// diff hunk showing the old line ahead of its replacement — confirm.
// In the rune variant each shift is evaluated on a byte operand, so the
// shifted terms become zero and only input[0] survives; the uint32
// variant widens every byte before shifting.
func decodeUtf32le(input []byte) rune {
return rune(input[3]<<24 | input[2]<<16 | input[1]<<8 | input[0])
func decodeUtf32le(input []byte) uint32 {
return uint32(input[3])<<24 | uint32(input[2])<<16 | uint32(input[1])<<8 | uint32(input[0])
}
func newRecognizer_utf32be() *recognizerUtf32 {
@@ -82,7 +82,7 @@ func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput
hasBom := bytes.HasPrefix(input.raw, r.bom)
var numValid, numInvalid uint32
for b := input.raw; len(b) >= 4; b = b[4:] {
if c := r.decodeChar(b); c < 0 || c >= 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF) {
if c := r.decodeChar(b); c >= 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF) {
numInvalid++
} else {
numValid++