test_pagure_lib_encoding_utils.py 3.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. # -*- coding: utf-8 -*-
  2. """
  3. Tests for :mod:`pagure.lib.encoding_utils`.
  4. """
  5. import chardet
  6. import os
  7. import unittest
  8. import sys
  9. sys.path.insert(0, os.path.join(os.path.dirname(
  10. os.path.abspath(__file__)), '..'))
  11. from pagure.lib import encoding_utils
  12. class TestGuessEncoding(unittest.TestCase):
  13. def test_guess_encoding_ascii(self):
  14. """
  15. Assert when ascii-only data is provided ascii is the guessed encoding.
  16. """
  17. data = u'Twas bryllyg, and the slythy toves did gyre and gymble'
  18. result = encoding_utils.guess_encoding(data.encode('ascii'))
  19. self.assertEqual(result, 'ascii')
  20. def test_guess_encoding_favor_utf_8(self):
  21. """
  22. Test that strings that could be UTF-8 or ISO-8859-* result in UTF-8.
  23. python-chardet-3.0.4-2.fc27.noarch detects it as ISO-8859-9
  24. python-chardet-2.2.1-1.el7_1.noarch detects it as ISO-8859-2
  25. """
  26. data = u'Šabata'.encode('utf-8')
  27. result = encoding_utils.guess_encoding(data)
  28. chardet_result = chardet.detect(data)
  29. self.assertEqual(result, 'utf-8')
  30. if chardet.__version__[0] == '3':
  31. self.assertEqual(chardet_result['encoding'], 'ISO-8859-9')
  32. else:
  33. self.assertEqual(chardet_result['encoding'], 'ISO-8859-2')
  34. def test_guess_encoding_no_data(self):
  35. """ Test encoding_utils.guess_encoding() with an empty string """
  36. result = encoding_utils.guess_encoding(u''.encode('utf-8'))
  37. self.assertEqual(result, 'ascii')
  38. class TestGuessEncodings(unittest.TestCase):
  39. def test_guess_encodings(self):
  40. """ Test the encoding_utils.guess_encodings() method. """
  41. data = u'Šabata'.encode('utf-8')
  42. result = encoding_utils.guess_encodings(data)
  43. chardet_result = chardet.detect(data)
  44. if chardet.__version__[0] == '3':
  45. # The first three have different confidence values
  46. self.assertListEqual(
  47. [encoding.encoding for encoding in result][:3],
  48. ['utf-8', 'ISO-8859-9', 'ISO-8859-1']
  49. )
  50. # This is the one with the least confidence
  51. self.assertEqual(result[-1].encoding, 'windows-1255')
  52. # The values in the middle of the list all have the same confidence
  53. # value and can't be sorted reliably: use sets.
  54. self.assertEqual(
  55. set([encoding.encoding for encoding in result]),
  56. set(['utf-8', 'ISO-8859-9', 'ISO-8859-1', 'MacCyrillic',
  57. 'IBM866', 'TIS-620', 'EUC-JP', 'EUC-KR', 'GB2312',
  58. 'KOI8-R', 'Big5', 'IBM855', 'ISO-8859-7', 'SHIFT_JIS',
  59. 'windows-1253', 'CP949', 'EUC-TW', 'ISO-8859-5',
  60. 'windows-1251', 'windows-1255'])
  61. )
  62. self.assertEqual(chardet_result['encoding'], 'ISO-8859-9')
  63. else:
  64. self.assertListEqual(
  65. [encoding.encoding for encoding in result],
  66. ['utf-8', 'ISO-8859-2', 'windows-1252'])
  67. self.assertEqual(chardet_result['encoding'], 'ISO-8859-2')
  68. def test_guess_encodings_no_data(self):
  69. """ Test encoding_utils.guess_encodings() with an emtpy string """
  70. result = encoding_utils.guess_encodings(u''.encode('utf-8'))
  71. self.assertEqual(
  72. [encoding.encoding for encoding in result],
  73. ['ascii'])
  74. class TestDecode(unittest.TestCase):
  75. def test_decode(self):
  76. """ Test encoding_utils.decode() """
  77. data = u'Šabata'
  78. self.assertEqual(data, encoding_utils.decode(data.encode('utf-8')))
  79. if __name__ == '__main__':
  80. unittest.main(verbosity=2)