2019-12-12 17:50:19 +00:00
|
|
|
// Copyright 2019 Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
|
|
|
|
|
|
|
|
// Mostly copied and pasted from
// third_party/rust/chardetng/src/lib.rs , so
// "top-level directory of this distribution" above refers to
// third_party/rust/chardetng/
|
|
|
|
|
|
|
|
#ifndef mozilla_EncodingDetector_h
|
|
|
|
#define mozilla_EncodingDetector_h
|
|
|
|
|
|
|
|
#include "mozilla/Encoding.h"
|
|
|
|
|
|
|
|
// Forward declaration only: the CHARDETNG_ENCODING_DETECTOR macro defined
// below names this type before "chardetng.h" is included, so the type must
// already be declared at that point.
namespace mozilla {
class EncodingDetector;
}  // namespace mozilla
|
|
|
|
|
|
|
|
#define CHARDETNG_ENCODING_DETECTOR mozilla::EncodingDetector
|
|
|
|
|
|
|
|
#include "chardetng.h"
|
|
|
|
|
|
|
|
namespace mozilla {
|
|
|
|
|
|
|
|
/**
|
|
|
|
* A Web browser-oriented detector for guessing what character
|
|
|
|
* encoding a stream of bytes is encoded in.
|
|
|
|
*
|
|
|
|
* The bytes are fed to the detector incrementally using the `feed`
|
|
|
|
* method. The current guess of the detector can be queried using
|
|
|
|
* the `guess` method. The guessing parameters are arguments to the
|
|
|
|
* `guess` method rather than arguments to the constructor in order
|
|
|
|
* to enable the application to check if the arguments affect the
|
|
|
|
* guessing outcome. (The specific use case is to disable UI for
|
|
|
|
* re-running the detector with UTF-8 allowed and the top-level
|
|
|
|
* domain name ignored if those arguments don't change the guess.)
|
|
|
|
*/
|
|
|
|
class EncodingDetector final {
|
|
|
|
public:
|
|
|
|
~EncodingDetector() = default;
|
|
|
|
|
|
|
|
static void operator delete(void* aDetector) {
|
|
|
|
chardetng_encoding_detector_free(
|
|
|
|
reinterpret_cast<EncodingDetector*>(aDetector));
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Creates a new instance of the detector.
|
|
|
|
*/
|
|
|
|
static inline UniquePtr<EncodingDetector> Create() {
|
|
|
|
UniquePtr<EncodingDetector> detector(chardetng_encoding_detector_new());
|
|
|
|
return detector;
|
|
|
|
}
|
|
|
|
|
2021-01-24 00:11:07 +00:00
|
|
|
/**
|
|
|
|
* Queries whether the TLD is considered non-generic and could affect the
|
|
|
|
* guess.
|
|
|
|
*/
|
|
|
|
static inline bool TldMayAffectGuess(Span<const char> aTLD) {
|
|
|
|
return chardetng_encoding_detector_tld_may_affect_guess(aTLD.Elements(),
|
|
|
|
aTLD.Length());
|
|
|
|
}
|
|
|
|
|
2019-12-12 17:50:19 +00:00
|
|
|
/**
|
|
|
|
* Inform the detector of a chunk of input.
|
|
|
|
*
|
|
|
|
* The byte stream is represented as a sequence of calls to this
|
|
|
|
* method such that the concatenation of the arguments to this
|
|
|
|
* method form the byte stream. It does not matter how the application
|
|
|
|
* chooses to chunk the stream. It is OK to call this method with
|
|
|
|
* a zero-length byte slice.
|
|
|
|
*
|
|
|
|
* The end of the stream is indicated by calling this method with
|
|
|
|
* `aLast` set to `true`. In that case, the end of the stream is
|
|
|
|
* considered to occur after the last byte of the `aBuffer` (which
|
|
|
|
* may be zero-length) passed in the same call. Once this method
|
|
|
|
* has been called with `last` set to `true` this method must not
|
|
|
|
* be called again.
|
|
|
|
*
|
|
|
|
* If you want to perform detection on just the prefix of a longer
|
|
|
|
* stream, do not pass `aLast=true` after the prefix if the stream
|
|
|
|
* actually still continues.
|
|
|
|
*
|
|
|
|
* Returns `true` if after processing `aBuffer` the stream has
|
|
|
|
* contained at least one non-ASCII byte and `false` if only
|
|
|
|
* ASCII has been seen so far.
|
|
|
|
*
|
|
|
|
* # Panics
|
|
|
|
*
|
|
|
|
* If this method has previously been called with `aLast` set to `true`.
|
|
|
|
*/
|
|
|
|
inline bool Feed(Span<const uint8_t> aBuffer, bool aLast) {
|
|
|
|
return chardetng_encoding_detector_feed(this, aBuffer.Elements(),
|
|
|
|
aBuffer.Length(), aLast);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Guess the encoding given the bytes pushed to the detector so far
|
|
|
|
* (via `Feed()`), the top-level domain name from which the bytes were
|
|
|
|
* loaded, and an indication of whether to consider UTF-8 as a permissible
|
|
|
|
* guess.
|
|
|
|
*
|
|
|
|
* The `aTld` argument takes the rightmost DNS label of the hostname of the
|
|
|
|
* host the stream was loaded from in lower-case ASCII form. That is, if
|
|
|
|
* the label is an internationalized top-level domain name, it must be
|
|
|
|
* provided in its Punycode form. If the TLD that the stream was loaded
|
|
|
|
* from is unavalable, an empty `Spane` may be passed instead, which is
|
|
|
|
* equivalent to passing a `Span` for "com".
|
|
|
|
*
|
|
|
|
* If the `aAllowUTF8` argument is set to `false`, the return value of
|
|
|
|
* this method won't be `UTF_8_ENCODING`. When performing detection
|
|
|
|
* on `text/html` on non-`file:` URLs, Web browsers must pass `false`,
|
|
|
|
* unless the user has taken a specific contextual action to request an
|
|
|
|
* override. This way, Web developers cannot start depending on UTF-8
|
|
|
|
* detection. Such reliance would make the Web Platform more brittle.
|
|
|
|
*
|
|
|
|
* Returns the guessed encoding.
|
|
|
|
*
|
|
|
|
* # Panics
|
|
|
|
*
|
|
|
|
* If `aTld` contains non-ASCII, period, or upper-case letters. (The panic
|
|
|
|
* condition is intentionally limited to signs of failing to extract the
|
|
|
|
* label correctly, failing to provide it in its Punycode form, and failure
|
|
|
|
* to lower-case it. Full DNS label validation is intentionally not performed
|
|
|
|
* to avoid panics when the reality doesn't match the specs.)
|
|
|
|
*/
|
|
|
|
inline mozilla::NotNull<const mozilla::Encoding*> Guess(
|
|
|
|
Span<const char> aTLD, bool aAllowUTF8) const {
|
|
|
|
return WrapNotNull(chardetng_encoding_detector_guess(
|
|
|
|
this, aTLD.Elements(), aTLD.Length(), aAllowUTF8));
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
EncodingDetector() = delete;
|
|
|
|
EncodingDetector(const EncodingDetector&) = delete;
|
|
|
|
EncodingDetector& operator=(const EncodingDetector&) = delete;
|
|
|
|
};
|
|
|
|
|
|
|
|
}; // namespace mozilla
|
|
|
|
|
|
|
|
#endif // mozilla_EncodingDetector_h
|