summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'app-i18n/uchardet/files/uchardet-0.0.5-fix-ASCII-detection.patch')
-rw-r--r--app-i18n/uchardet/files/uchardet-0.0.5-fix-ASCII-detection.patch116
1 files changed, 116 insertions, 0 deletions
diff --git a/app-i18n/uchardet/files/uchardet-0.0.5-fix-ASCII-detection.patch b/app-i18n/uchardet/files/uchardet-0.0.5-fix-ASCII-detection.patch
new file mode 100644
index 000000000000..c82aee866ebc
--- /dev/null
+++ b/app-i18n/uchardet/files/uchardet-0.0.5-fix-ASCII-detection.patch
@@ -0,0 +1,116 @@
+commit 4c8316f9cfda38d75fb015c0eb40e0eebb03d28f
+Author: Jehan <jehan@girinstud.io>
+Date: Sat Dec 5 21:04:20 2015 +0100
+
+ Nearly-ASCII text with NBSP is still not ASCII.
+
+ There is no "exception" in encoding. The non-breaking space 0xA0 is not
+ ASCII, and therefore returning "ASCII" will later create issues (for
+ instance trying to re-encode with iconv produces an error).
+ This was obviously an explicit decision in original code (according to
+ code comments), probably tied to specificity of the original program from
+ Mozilla. Now we want strict detection.
+ I will return "ISO-8859-1" for "nearly-ASCII texts with NBSP as only
+ exception" (note that I could have returned any ISO-8859 charsets since
+ they all have this character in common).
+
+diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp
+index ab8bae0..ff06b9d 100644
+--- a/src/nsUniversalDetector.cpp
++++ b/src/nsUniversalDetector.cpp
+@@ -47,6 +47,7 @@
+
+ nsUniversalDetector::nsUniversalDetector(PRUint32 aLanguageFilter)
+ {
++ mNbspFound = PR_FALSE;
+ mDone = PR_FALSE;
+ mBestGuess = -1; //illegal value as signal
+ mInTag = PR_FALSE;
+@@ -75,6 +76,7 @@ nsUniversalDetector::~nsUniversalDetector()
+ void
+ nsUniversalDetector::Reset()
+ {
++ mNbspFound = PR_FALSE;
+ mDone = PR_FALSE;
+ mBestGuess = -1; //illegal value as signal
+ mInTag = PR_FALSE;
+@@ -162,9 +164,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
+ PRUint32 i;
+ for (i = 0; i < aLen; i++)
+ {
+- /* Other than 0xA0, if every other character is ASCII, the page is ASCII.
++ /* If every other character is ASCII or 0xA0, we don't run charset
++ * probers.
+ * 0xA0 (NBSP in a few charset) is apparently a rare exception
+- * of non-ASCII character contained in ASCII text. */
++ * of non-ASCII character often contained in nearly-ASCII text. */
+ if (aBuf[i] & '\x80' && aBuf[i] != '\xA0')
+ {
+ /* We got a non-ASCII byte (high-byte) */
+@@ -203,11 +206,19 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
+ }
+ else
+ {
+- //ok, just pure ascii so far
+- if ( ePureAscii == mInputState &&
+- (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
++ /* Just pure ASCII or NBSP so far. */
++ if (aBuf[i] == '\xA0')
+ {
+- //found escape character or HZ "~{"
++ /* ASCII with the only exception of NBSP seems quite common.
++ * I doubt it is really necessary to train a model here, so let's
++ * just make an exception.
++ */
++ mNbspFound = PR_TRUE;
++ }
++ else if (mInputState == ePureAscii &&
++ (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')))
++ {
++ /* We found an escape character or HZ "~{". */
+ mInputState = eEscAscii;
+ }
+ mLastChar = aBuf[i];
+@@ -229,6 +240,10 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
+ mDone = PR_TRUE;
+ mDetectedCharset = mEscCharSetProber->GetCharSetName();
+ }
++ else if (mNbspFound)
++ {
++ mDetectedCharset = "ISO-8859-1";
++ }
+ else
+ {
+ /* ASCII with the ESC character (or the sequence "~{") is still
+@@ -253,8 +268,17 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
+ break;
+
+ default:
+- /* Pure ASCII */
+- mDetectedCharset = "ASCII";
++ if (mNbspFound)
++ {
++ /* ISO-8859-1 is a good result candidate for ASCII + NBSP.
++ * (though it could have been any ISO-8859 encoding). */
++ mDetectedCharset = "ISO-8859-1";
++ }
++ else
++ {
++ /* Pure ASCII */
++ mDetectedCharset = "ASCII";
++ }
+ break;
+ }
+ return NS_OK;
+diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h
+index 4d9b460..9f0a4b1 100644
+--- a/src/nsUniversalDetector.h
++++ b/src/nsUniversalDetector.h
+@@ -72,6 +72,7 @@ protected:
+ virtual void Report(const char* aCharset) = 0;
+ virtual void Reset();
+ nsInputState mInputState;
++ PRBool mNbspFound;
+ PRBool mDone;
+ PRBool mInTag;
+ PRBool mStart;