test_pagure_lib_encoding_utils.py 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
  1. # -*- coding: utf-8 -*-
  2. """
  3. Tests for :mod:`pagure.lib.encoding_utils`.
  4. """
  5. from __future__ import unicode_literals
  6. import chardet
  7. import os
  8. import unittest
  9. import sys
  10. sys.path.insert(0, os.path.join(os.path.dirname(
  11. os.path.abspath(__file__)), '..'))
  12. from pagure.lib import encoding_utils
  13. class TestGuessEncoding(unittest.TestCase):
  14. def test_guess_encoding_ascii(self):
  15. """
  16. Assert when ascii-only data is provided ascii is the guessed encoding.
  17. """
  18. data = 'Twas bryllyg, and the slythy toves did gyre and gymble'
  19. result = encoding_utils.guess_encoding(data.encode('ascii'))
  20. self.assertEqual(result, 'ascii')
  21. def test_guess_encoding_favor_utf_8(self):
  22. """
  23. Test that strings that could be UTF-8 or ISO-8859-* result in UTF-8.
  24. python-chardet-3.0.4-2.fc27.noarch detects it as ISO-8859-9
  25. python-chardet-2.2.1-1.el7_1.noarch detects it as ISO-8859-2
  26. """
  27. data = 'Šabata'.encode('utf-8')
  28. result = encoding_utils.guess_encoding(data)
  29. chardet_result = chardet.detect(data)
  30. self.assertEqual(result, 'utf-8')
  31. if chardet.__version__[0] == '3':
  32. self.assertEqual(chardet_result['encoding'], 'ISO-8859-9')
  33. else:
  34. self.assertEqual(chardet_result['encoding'], 'ISO-8859-2')
  35. def test_guess_encoding_no_data(self):
  36. """ Test encoding_utils.guess_encoding() with an empty string """
  37. result = encoding_utils.guess_encoding(''.encode('utf-8'))
  38. self.assertEqual(result, 'ascii')
  39. class TestGuessEncodings(unittest.TestCase):
  40. def test_guess_encodings(self):
  41. """ Test the encoding_utils.guess_encodings() method. """
  42. data = 'Šabata'.encode('utf-8')
  43. result = encoding_utils.guess_encodings(data)
  44. chardet_result = chardet.detect(data)
  45. if chardet.__version__[0] == '3':
  46. # The first three have different confidence values
  47. self.assertListEqual(
  48. [encoding.encoding for encoding in result][:3],
  49. ['utf-8', 'ISO-8859-9', 'ISO-8859-1']
  50. )
  51. # This is the one with the least confidence
  52. self.assertEqual(result[-1].encoding, 'windows-1255')
  53. # The values in the middle of the list all have the same confidence
  54. # value and can't be sorted reliably: use sets.
  55. self.assertEqual(
  56. set([encoding.encoding for encoding in result]),
  57. set(['utf-8', 'ISO-8859-9', 'ISO-8859-1', 'MacCyrillic',
  58. 'IBM866', 'TIS-620', 'EUC-JP', 'EUC-KR', 'GB2312',
  59. 'KOI8-R', 'Big5', 'IBM855', 'ISO-8859-7', 'SHIFT_JIS',
  60. 'windows-1253', 'CP949', 'EUC-TW', 'ISO-8859-5',
  61. 'windows-1251', 'windows-1255'])
  62. )
  63. self.assertEqual(chardet_result['encoding'], 'ISO-8859-9')
  64. else:
  65. self.assertListEqual(
  66. [encoding.encoding for encoding in result],
  67. ['utf-8', 'ISO-8859-2', 'windows-1252'])
  68. self.assertEqual(chardet_result['encoding'], 'ISO-8859-2')
  69. def test_guess_encodings_no_data(self):
  70. """ Test encoding_utils.guess_encodings() with an emtpy string """
  71. result = encoding_utils.guess_encodings(''.encode('utf-8'))
  72. self.assertEqual(
  73. [encoding.encoding for encoding in result],
  74. ['ascii'])
  75. class TestDecode(unittest.TestCase):
  76. def test_decode(self):
  77. """ Test encoding_utils.decode() """
  78. data = 'Šabata'
  79. self.assertEqual(data, encoding_utils.decode(data.encode('utf-8')))
  80. if __name__ == '__main__':
  81. unittest.main(verbosity=2)