mirror of
https://github.com/go-gitea/chardet.git
synced 2026-07-01 20:24:11 -04:00
Add detector_test and fix bugs found by it
This commit is contained in:
@@ -0,0 +1,22 @@
|
||||
package chardet
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestDetector(t *testing.T) {
|
||||
type file_charset_language struct {
|
||||
File, Charset, Language string
|
||||
}
|
||||
var data = []file_charset_language{
|
||||
{"utf8.txt", "UTF-8", ""},
|
||||
{"big5.txt", "Big5", "zh"},
|
||||
{"shift_jis.txt", "Shift_JIS", "ja"},
|
||||
{"gb18030.txt", "GB-18030", "zh"},
|
||||
}
|
||||
|
||||
ct := newChardetTester()
|
||||
for _, d := range data {
|
||||
ct.ExpectBest(embeddedfiles[d.File], d.Charset, d.Language, t)
|
||||
}
|
||||
}
|
||||
+1
-1
@@ -78,7 +78,7 @@ func (r *recognizerMultiByte) matchConfidence(input *recognizerInput) int {
|
||||
|
||||
func binarySearch(l []uint16, c uint16) bool {
|
||||
start := 0
|
||||
end := len(l)
|
||||
end := len(l) - 1
|
||||
for start <= end {
|
||||
mid := (start + end) / 2
|
||||
if c == l[mid] {
|
||||
|
||||
+6
-6
@@ -48,15 +48,15 @@ func (*recognizerUtf16le) Match(input *recognizerInput) (output recognizerOutput
|
||||
// recognizerUtf32 describes one UTF-32 variant for detection.
//
// bom is the byte-order mark that Match looks for at the start of the input,
// and decodeChar turns the first four bytes of a slice into a code unit.
// decodeChar returns uint32 rather than rune so that out-of-range values stay
// non-negative and can be rejected with a single upper-bound comparison.
type recognizerUtf32 struct {
	name       string
	bom        []byte
	decodeChar func(input []byte) uint32
}
|
||||
|
||||
// decodeUtf32be assembles the first four bytes of input into a 32-bit value,
// most significant byte first. It panics if input holds fewer than four bytes.
//
// Each byte is widened to uint32 before shifting: shifting a byte by 8 or
// more bits yields zero, which would drop everything but the lowest byte.
func decodeUtf32be(input []byte) uint32 {
	_ = input[3] // bounds check hint; fail up front on short input
	return uint32(input[0])<<24 | uint32(input[1])<<16 | uint32(input[2])<<8 | uint32(input[3])
}
|
||||
|
||||
// decodeUtf32le assembles the first four bytes of input into a 32-bit value,
// least significant byte first. It panics if input holds fewer than four
// bytes.
//
// Each byte is widened to uint32 before shifting: shifting a byte by 8 or
// more bits yields zero, which would drop everything but the lowest byte.
func decodeUtf32le(input []byte) uint32 {
	return uint32(input[3])<<24 | uint32(input[2])<<16 | uint32(input[1])<<8 | uint32(input[0])
}
|
||||
|
||||
func newRecognizer_utf32be() *recognizerUtf32 {
|
||||
@@ -82,7 +82,7 @@ func (r *recognizerUtf32) Match(input *recognizerInput) (output recognizerOutput
|
||||
hasBom := bytes.HasPrefix(input.raw, r.bom)
|
||||
var numValid, numInvalid uint32
|
||||
for b := input.raw; len(b) >= 4; b = b[4:] {
|
||||
if c := r.decodeChar(b); c < 0 || c >= 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF) {
|
||||
if c := r.decodeChar(b); c >= 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF) {
|
||||
numInvalid++
|
||||
} else {
|
||||
numValid++
|
||||
|
||||
Reference in New Issue
Block a user