Browse Source

Improve cchardet integration

Signed-off-by: Pierre-Yves Chibon <pingou@pingoured.fr>
Pierre-Yves Chibon 3 years ago
parent
commit
df55e8b96b

+ 13 - 4
pagure/lib/encoding_utils.py

@@ -51,14 +51,23 @@ def detect_encodings(data):
     # of the detector to bias the utf-8 result.
     if cchardet is not None:
         detector = cchardet.UniversalDetector()
+        detector.reset()
+        detector.feed(data)
+        detector.close()
+        result = detector.result
     else:
         detector = universaldetector.UniversalDetector()
-    detector.reset()
-    detector.feed(data)
-    result = detector.close()
-    if not result:
+        detector.reset()
+        detector.feed(data)
+        result = detector.close()
+
+    if not result or not result["encoding"]:
         return {"utf-8": 1.0}
     encodings = {result["encoding"]: result["confidence"]}
+
+    if cchardet:
+        return encodings
+
     if ch_version[0] in ("3", "4"):
         for prober in detector._charset_probers:
             if hasattr(prober, "probers"):

+ 16 - 40
tests/test_pagure_flask_ui_repo.py

@@ -3176,16 +3176,10 @@ class PagureFlaskRepotests(tests.Modeltests):
         output = self.app.get("/test/raw/master")
         self.assertEqual(output.status_code, 200)
         output_text = output.get_data(as_text=True)
-        if cchardet is not None:
-            self.assertEqual(
-                output.headers["Content-Type"].lower(),
-                "text/plain; charset=utf-8",
-            )
-        else:
-            self.assertEqual(
-                output.headers["Content-Type"].lower(),
-                "text/plain; charset=ascii",
-            )
+        self.assertEqual(
+            output.headers["Content-Type"].lower(),
+            "text/plain; charset=ascii",
+        )
         self.assertIn(":Author: Pierre-Yves Chibon", output_text)
 
         # Add some more content to the repo
@@ -3204,16 +3198,10 @@ class PagureFlaskRepotests(tests.Modeltests):
 
         # View in a branch
         output = self.app.get("/test/raw/master/f/sources")
-        if cchardet is not None:
-            self.assertEqual(
-                output.headers["Content-Type"].lower(),
-                "text/plain; charset=utf-8",
-            )
-        else:
-            self.assertEqual(
-                output.headers["Content-Type"].lower(),
-                "text/plain; charset=ascii",
-            )
+        self.assertEqual(
+            output.headers["Content-Type"].lower(),
+            "text/plain; charset=ascii",
+        )
         self.assertEqual(output.status_code, 200)
         output_text = output.get_data(as_text=True)
         self.assertIn("foo\n bar", output_text)
@@ -3264,16 +3252,10 @@ class PagureFlaskRepotests(tests.Modeltests):
         output = self.app.get("/test/raw/master")
         self.assertEqual(output.status_code, 200)
         output_text = output.get_data(as_text=True)
-        if cchardet is not None:
-            self.assertEqual(
-                output.headers["Content-Type"].lower(),
-                "text/plain; charset=utf-8",
-            )
-        else:
-            self.assertEqual(
-                output.headers["Content-Type"].lower(),
-                "text/plain; charset=ascii",
-            )
+        self.assertEqual(
+            output.headers["Content-Type"].lower(),
+            "text/plain; charset=ascii",
+        )
         self.assertTrue(
             output_text.startswith("diff --git a/test_binary b/test_binary\n")
         )
@@ -3311,16 +3293,10 @@ class PagureFlaskRepotests(tests.Modeltests):
         output = self.app.get("/fork/pingou/test3/raw/master/f/sources")
         self.assertEqual(output.status_code, 200)
         output_text = output.get_data(as_text=True)
-        if cchardet is not None:
-            self.assertEqual(
-                output.headers["Content-Type"].lower(),
-                "text/plain; charset=utf-8",
-            )
-        else:
-            self.assertEqual(
-                output.headers["Content-Type"].lower(),
-                "text/plain; charset=ascii",
-            )
+        self.assertEqual(
+            output.headers["Content-Type"].lower(),
+            "text/plain; charset=ascii",
+        )
         self.assertIn("foo\n bar", output_text)
 
     def test_view_commit(self):

+ 38 - 33
tests/test_pagure_lib_encoding_utils.py

@@ -32,7 +32,7 @@ class TestGuessEncoding(unittest.TestCase):
         data = "Twas bryllyg, and the slythy toves did gyre and gymble"
         result = encoding_utils.guess_encoding(data.encode("ascii"))
         if cchardet is not None:
-            self.assertEqual(result, "utf-8")
+            self.assertEqual(result, "ASCII")
         else:
             self.assertEqual(result, "ascii")
 
@@ -46,11 +46,14 @@ class TestGuessEncoding(unittest.TestCase):
         data = "Šabata".encode("utf-8")
         result = encoding_utils.guess_encoding(data)
         chardet_result = chardet.detect(data)
-        self.assertEqual(result, "utf-8")
-        if chardet.__version__[0] == "3":
-            self.assertEqual(chardet_result["encoding"], "ISO-8859-9")
+        if cchardet:
+            self.assertEqual(result, "WINDOWS-1250")
         else:
-            self.assertEqual(chardet_result["encoding"], "ISO-8859-2")
+            self.assertEqual(result, "utf-8")
+            if chardet.__version__[0] in ("3", "4"):
+                self.assertEqual(chardet_result["encoding"], "ISO-8859-9")
+            else:
+                self.assertEqual(chardet_result["encoding"], "ISO-8859-2")
 
     def test_guess_encoding_no_data(self):
         """ Test encoding_utils.guess_encoding() with an empty string """
@@ -64,25 +67,22 @@ class TestGuessEncodings(unittest.TestCase):
         data = "Šabata".encode("utf-8")
         result = encoding_utils.guess_encodings(data)
         chardet_result = chardet.detect(data)
-        if chardet.__version__[0] == "3":
-            # The first three have different confidence values
-            if cchardet is not None:
-                expexted_list = ["utf-8"]
-                # The last one in the list (which apparently has only one)
-                self.assertEqual(result[-1].encoding, "utf-8")
-            else:
+        if cchardet is not None:
+            # The last one in the list (which apparently has only one)
+            self.assertEqual(result[-1].encoding, "WINDOWS-1250")
+        else:
+            if chardet.__version__[0] in ("3", "4"):
+                # The first three have different confidence values
                 expexted_list = ["utf-8", "ISO-8859-9", "ISO-8859-1"]
                 # This is the one with the least confidence
                 self.assertEqual(result[-1].encoding, "windows-1255")
-            self.assertListEqual(
-                [encoding.encoding for encoding in result][:3], expexted_list
-            )
-
-            # The values in the middle of the list all have the same confidence
-            # value and can't be sorted reliably: use sets.
-            if cchardet is not None:
-                expected_list = sorted(["utf-8"])
-            else:
+                self.assertListEqual(
+                    [encoding.encoding for encoding in result][:3],
+                    expexted_list,
+                )
+
+                # The values in the middle of the list all have the same confidence
+                # value and can't be sorted reliably: use sets.
                 expected_list = sorted(
                     [
                         "utf-8",
@@ -107,17 +107,17 @@ class TestGuessEncodings(unittest.TestCase):
                         "windows-1255",
                     ]
                 )
-            self.assertListEqual(
-                sorted(set([encoding.encoding for encoding in result])),
-                expected_list,
-            )
-            self.assertEqual(chardet_result["encoding"], "ISO-8859-9")
-        else:
-            self.assertListEqual(
-                [encoding.encoding for encoding in result],
-                ["utf-8", "ISO-8859-2", "windows-1252"],
-            )
-            self.assertEqual(chardet_result["encoding"], "ISO-8859-2")
+                self.assertListEqual(
+                    sorted(set([encoding.encoding for encoding in result])),
+                    expected_list,
+                )
+                self.assertEqual(chardet_result["encoding"], "ISO-8859-9")
+            else:
+                self.assertListEqual(
+                    [encoding.encoding for encoding in result],
+                    ["utf-8", "ISO-8859-2", "windows-1252"],
+                )
+                self.assertEqual(chardet_result["encoding"], "ISO-8859-2")
 
     def test_guess_encodings_no_data(self):
         """ Test encoding_utils.guess_encodings() with an empty string """
@@ -128,7 +128,12 @@ class TestGuessEncodings(unittest.TestCase):
 class TestDecode(unittest.TestCase):
     def test_decode(self):
         """ Test encoding_utils.decode() """
-        data = "Šabata"
+        data = (
+            "This is a little longer text for testing Šabata's encoding. "
+            "With more characters, let's see if it become more clear as to what "
+            "encoding should be used for this. We'll include from french words "
+            "in there for non-ascii: français, gagné!"
+        )
         self.assertEqual(data, encoding_utils.decode(data.encode("utf-8")))
 
 

+ 3 - 3
tests/test_pagure_lib_mimetype.py

@@ -30,13 +30,13 @@ class TestMIMEType(unittest.TestCase):
                 "hello.html",
                 b"#!",
                 "text/html",
-                "ascii" if cchardet is None else "utf-8",
+                "ascii" if cchardet is None else "ASCII",
             ),
             (
                 "hello",
                 b"#!",
                 "text/plain",
-                "ascii" if cchardet is None else "utf-8",
+                "ascii" if cchardet is None else "ASCII",
             ),
             ("hello.jpg", None, "image/jpeg", None),
             ("hello.jpg", b"#!", "image/jpeg", None),
@@ -70,7 +70,7 @@ class TestMIMEType(unittest.TestCase):
                 b"#!",
                 "text/plain; charset=ascii"
                 if cchardet is None
-                else "text/plain; charset=utf-8",
+                else "text/plain; charset=ASCII",
             ),
             ("hello.jpg", None, "image/jpeg"),
             ("hello.jpg", b"#!", "image/jpeg"),