test_pagure_lib_encoding_utils.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116
  1. # -*- coding: utf-8 -*-
  2. """
  3. Tests for :mod:`pagure.lib.encoding_utils`.
  4. """
  5. from __future__ import unicode_literals, absolute_import
  6. import chardet
  7. import os
  8. import unittest
  9. import sys
  10. sys.path.insert(
  11. 0, os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")
  12. )
  13. from pagure.lib import encoding_utils
  14. class TestGuessEncoding(unittest.TestCase):
  15. def test_guess_encoding_ascii(self):
  16. """
  17. Assert when ascii-only data is provided ascii is the guessed encoding.
  18. """
  19. data = "Twas bryllyg, and the slythy toves did gyre and gymble"
  20. result = encoding_utils.guess_encoding(data.encode("ascii"))
  21. self.assertEqual(result, "ascii")
  22. def test_guess_encoding_favor_utf_8(self):
  23. """
  24. Test that strings that could be UTF-8 or ISO-8859-* result in UTF-8.
  25. python-chardet-3.0.4-2.fc27.noarch detects it as ISO-8859-9
  26. python-chardet-2.2.1-1.el7_1.noarch detects it as ISO-8859-2
  27. """
  28. data = "Šabata".encode("utf-8")
  29. result = encoding_utils.guess_encoding(data)
  30. chardet_result = chardet.detect(data)
  31. self.assertEqual(result, "utf-8")
  32. if chardet.__version__[0] == "3":
  33. self.assertEqual(chardet_result["encoding"], "ISO-8859-9")
  34. else:
  35. self.assertEqual(chardet_result["encoding"], "ISO-8859-2")
  36. def test_guess_encoding_no_data(self):
  37. """ Test encoding_utils.guess_encoding() with an empty string """
  38. result = encoding_utils.guess_encoding("".encode("utf-8"))
  39. self.assertEqual(result, "ascii")
  40. class TestGuessEncodings(unittest.TestCase):
  41. def test_guess_encodings(self):
  42. """ Test the encoding_utils.guess_encodings() method. """
  43. data = "Šabata".encode("utf-8")
  44. result = encoding_utils.guess_encodings(data)
  45. chardet_result = chardet.detect(data)
  46. if chardet.__version__[0] == "3":
  47. # The first three have different confidence values
  48. self.assertListEqual(
  49. [encoding.encoding for encoding in result][:3],
  50. ["utf-8", "ISO-8859-9", "ISO-8859-1"],
  51. )
  52. # This is the one with the least confidence
  53. self.assertEqual(result[-1].encoding, "windows-1255")
  54. # The values in the middle of the list all have the same confidence
  55. # value and can't be sorted reliably: use sets.
  56. self.assertEqual(
  57. set([encoding.encoding for encoding in result]),
  58. set(
  59. [
  60. "utf-8",
  61. "ISO-8859-9",
  62. "ISO-8859-1",
  63. "MacCyrillic",
  64. "IBM866",
  65. "TIS-620",
  66. "EUC-JP",
  67. "EUC-KR",
  68. "GB2312",
  69. "KOI8-R",
  70. "Big5",
  71. "IBM855",
  72. "ISO-8859-7",
  73. "SHIFT_JIS",
  74. "windows-1253",
  75. "CP949",
  76. "EUC-TW",
  77. "ISO-8859-5",
  78. "windows-1251",
  79. "windows-1255",
  80. ]
  81. ),
  82. )
  83. self.assertEqual(chardet_result["encoding"], "ISO-8859-9")
  84. else:
  85. self.assertListEqual(
  86. [encoding.encoding for encoding in result],
  87. ["utf-8", "ISO-8859-2", "windows-1252"],
  88. )
  89. self.assertEqual(chardet_result["encoding"], "ISO-8859-2")
  90. def test_guess_encodings_no_data(self):
  91. """ Test encoding_utils.guess_encodings() with an emtpy string """
  92. result = encoding_utils.guess_encodings("".encode("utf-8"))
  93. self.assertEqual([encoding.encoding for encoding in result], ["ascii"])
  94. class TestDecode(unittest.TestCase):
  95. def test_decode(self):
  96. """ Test encoding_utils.decode() """
  97. data = "Šabata"
  98. self.assertEqual(data, encoding_utils.decode(data.encode("utf-8")))
  99. if __name__ == "__main__":
  100. unittest.main(verbosity=2)