patch.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463
  1. # -*- coding: utf-8 -*-
  2. # The MIT License (MIT)
  3. # Copyright (c) 2014-2017 Matias Bordese
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  16. # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  17. # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  18. # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
  19. # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  20. # OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
  21. # OR OTHER DEALINGS IN THE SOFTWARE.
  22. """Classes used by the unified diff parser to keep the diff data."""
  23. from __future__ import unicode_literals
  24. import codecs
  25. import sys
  26. from .constants import (
  27. DEFAULT_ENCODING,
  28. LINE_TYPE_ADDED,
  29. LINE_TYPE_CONTEXT,
  30. LINE_TYPE_EMPTY,
  31. LINE_TYPE_REMOVED,
  32. LINE_TYPE_NO_NEWLINE,
  33. LINE_VALUE_NO_NEWLINE,
  34. RE_HUNK_BODY_LINE,
  35. RE_HUNK_EMPTY_BODY_LINE,
  36. RE_HUNK_HEADER,
  37. RE_SOURCE_FILENAME,
  38. RE_TARGET_FILENAME,
  39. RE_NO_NEWLINE_MARKER,
  40. )
  41. from .errors import UnidiffParseError
  42. PY2 = sys.version_info[0] == 2
  43. if PY2:
  44. from StringIO import StringIO
  45. open_file = codecs.open
  46. make_str = lambda x: x.encode(DEFAULT_ENCODING)
  47. def implements_to_string(cls):
  48. cls.__unicode__ = cls.__str__
  49. cls.__str__ = lambda x: x.__unicode__().encode(DEFAULT_ENCODING)
  50. return cls
  51. else:
  52. from io import StringIO
  53. open_file = open
  54. make_str = str
  55. implements_to_string = lambda x: x
  56. unicode = str
  57. basestring = str
  58. @implements_to_string
  59. class Line(object):
  60. """A diff line."""
  61. def __init__(self, value, line_type,
  62. source_line_no=None, target_line_no=None, diff_line_no=None):
  63. super(Line, self).__init__()
  64. self.source_line_no = source_line_no
  65. self.target_line_no = target_line_no
  66. self.diff_line_no = diff_line_no
  67. self.line_type = line_type
  68. self.value = value
  69. def __repr__(self):
  70. return make_str("<Line: %s%s>") % (self.line_type, self.value)
  71. def __str__(self):
  72. return "%s%s" % (self.line_type, self.value)
  73. def __eq__(self, other):
  74. return (self.source_line_no == other.source_line_no and
  75. self.target_line_no == other.target_line_no and
  76. self.diff_line_no == other.diff_line_no and
  77. self.line_type == other.line_type and
  78. self.value == other.value)
  79. @property
  80. def is_added(self):
  81. return self.line_type == LINE_TYPE_ADDED
  82. @property
  83. def is_removed(self):
  84. return self.line_type == LINE_TYPE_REMOVED
  85. @property
  86. def is_context(self):
  87. return self.line_type == LINE_TYPE_CONTEXT
  88. @implements_to_string
  89. class PatchInfo(list):
  90. """Lines with extended patch info.
  91. Format of this info is not documented and it very much depends on
  92. patch producer.
  93. """
  94. def __repr__(self):
  95. value = "<PatchInfo: %s>" % self[0].strip()
  96. return make_str(value)
  97. def __str__(self):
  98. return ''.join(unicode(line) for line in self)
  99. @implements_to_string
  100. class Hunk(list):
  101. """Each of the modified blocks of a file."""
  102. def __init__(self, src_start=0, src_len=0, tgt_start=0, tgt_len=0,
  103. section_header=''):
  104. if src_len is None:
  105. src_len = 1
  106. if tgt_len is None:
  107. tgt_len = 1
  108. self.added = 0 # number of added lines
  109. self.removed = 0 # number of removed lines
  110. self.source = []
  111. self.source_start = int(src_start)
  112. self.source_length = int(src_len)
  113. self.target = []
  114. self.target_start = int(tgt_start)
  115. self.target_length = int(tgt_len)
  116. self.section_header = section_header
  117. def __repr__(self):
  118. value = "<Hunk: @@ %d,%d %d,%d @@ %s>" % (self.source_start,
  119. self.source_length,
  120. self.target_start,
  121. self.target_length,
  122. self.section_header)
  123. return make_str(value)
  124. def __str__(self):
  125. # section header is optional and thus we output it only if it's present
  126. head = "@@ -%d,%d +%d,%d @@%s\n" % (
  127. self.source_start, self.source_length,
  128. self.target_start, self.target_length,
  129. ' ' + self.section_header if self.section_header else '')
  130. content = ''.join(unicode(line) for line in self)
  131. return head + content
  132. def append(self, line):
  133. """Append the line to hunk, and keep track of source/target lines."""
  134. super(Hunk, self).append(line)
  135. s = str(line)
  136. if line.is_added:
  137. self.added += 1
  138. self.target.append(s)
  139. elif line.is_removed:
  140. self.removed += 1
  141. self.source.append(s)
  142. elif line.is_context:
  143. self.target.append(s)
  144. self.source.append(s)
  145. def is_valid(self):
  146. """Check hunk header data matches entered lines info."""
  147. return (len(self.source) == self.source_length and
  148. len(self.target) == self.target_length)
  149. def source_lines(self):
  150. """Hunk lines from source file (generator)."""
  151. return (l for l in self if l.is_context or l.is_removed)
  152. def target_lines(self):
  153. """Hunk lines from target file (generator)."""
  154. return (l for l in self if l.is_context or l.is_added)
  155. class PatchedFile(list):
  156. """Patch updated file, it is a list of Hunks."""
  157. def __init__(self, patch_info=None, source='', target='',
  158. source_timestamp=None, target_timestamp=None):
  159. super(PatchedFile, self).__init__()
  160. self.patch_info = patch_info
  161. self.source_file = source
  162. self.source_timestamp = source_timestamp
  163. self.target_file = target
  164. self.target_timestamp = target_timestamp
  165. def __repr__(self):
  166. return make_str("<PatchedFile: %s>") % make_str(self.path)
  167. def __str__(self):
  168. # patch info is optional
  169. info = '' if self.patch_info is None else str(self.patch_info)
  170. source = "--- %s%s\n" % (
  171. self.source_file,
  172. '\t' + self.source_timestamp if self.source_timestamp else '')
  173. target = "+++ %s%s\n" % (
  174. self.target_file,
  175. '\t' + self.target_timestamp if self.target_timestamp else '')
  176. hunks = ''.join(unicode(hunk) for hunk in self)
  177. return info + source + target + hunks
  178. def _parse_hunk(self, header, diff, encoding):
  179. """Parse hunk details."""
  180. header_info = RE_HUNK_HEADER.match(header)
  181. hunk_info = header_info.groups()
  182. hunk = Hunk(*hunk_info)
  183. source_line_no = hunk.source_start
  184. target_line_no = hunk.target_start
  185. expected_source_end = source_line_no + hunk.source_length
  186. expected_target_end = target_line_no + hunk.target_length
  187. for diff_line_no, line in diff:
  188. if encoding is not None:
  189. line = line.decode(encoding)
  190. valid_line = RE_HUNK_EMPTY_BODY_LINE.match(line)
  191. if not valid_line:
  192. valid_line = RE_HUNK_BODY_LINE.match(line)
  193. if not valid_line:
  194. raise UnidiffParseError('Hunk diff line expected: %s' % line)
  195. line_type = valid_line.group('line_type')
  196. if line_type == LINE_TYPE_EMPTY:
  197. line_type = LINE_TYPE_CONTEXT
  198. value = valid_line.group('value')
  199. original_line = Line(value, line_type=line_type)
  200. if line_type == LINE_TYPE_ADDED:
  201. original_line.target_line_no = target_line_no
  202. target_line_no += 1
  203. elif line_type == LINE_TYPE_REMOVED:
  204. original_line.source_line_no = source_line_no
  205. source_line_no += 1
  206. elif line_type == LINE_TYPE_CONTEXT:
  207. original_line.target_line_no = target_line_no
  208. target_line_no += 1
  209. original_line.source_line_no = source_line_no
  210. source_line_no += 1
  211. elif line_type == LINE_TYPE_NO_NEWLINE:
  212. pass
  213. else:
  214. original_line = None
  215. # stop parsing if we got past expected number of lines
  216. if (source_line_no > expected_source_end or
  217. target_line_no > expected_target_end):
  218. raise UnidiffParseError('Hunk is longer than expected')
  219. if original_line:
  220. original_line.diff_line_no = diff_line_no
  221. hunk.append(original_line)
  222. # if hunk source/target lengths are ok, hunk is complete
  223. if (source_line_no == expected_source_end and
  224. target_line_no == expected_target_end):
  225. break
  226. # report an error if we haven't got expected number of lines
  227. if (source_line_no < expected_source_end or
  228. target_line_no < expected_target_end):
  229. raise UnidiffParseError('Hunk is shorter than expected')
  230. self.append(hunk)
  231. def _add_no_newline_marker_to_last_hunk(self):
  232. if not self:
  233. raise UnidiffParseError(
  234. 'Unexpected marker:' + LINE_VALUE_NO_NEWLINE)
  235. last_hunk = self[-1]
  236. last_hunk.append(
  237. Line(LINE_VALUE_NO_NEWLINE + '\n', line_type=LINE_TYPE_NO_NEWLINE))
  238. def _append_trailing_empty_line(self):
  239. if not self:
  240. raise UnidiffParseError('Unexpected trailing newline character')
  241. last_hunk = self[-1]
  242. last_hunk.append(Line('\n', line_type=LINE_TYPE_EMPTY))
  243. @property
  244. def path(self):
  245. """Return the file path abstracted from VCS."""
  246. if (self.source_file.startswith('a/') and
  247. self.target_file.startswith('b/')):
  248. filepath = self.source_file[2:]
  249. elif (self.source_file.startswith('a/') and
  250. self.target_file == '/dev/null'):
  251. filepath = self.source_file[2:]
  252. elif (self.target_file.startswith('b/') and
  253. self.source_file == '/dev/null'):
  254. filepath = self.target_file[2:]
  255. else:
  256. filepath = self.source_file
  257. return filepath
  258. @property
  259. def added(self):
  260. """Return the file total added lines."""
  261. return sum([hunk.added for hunk in self])
  262. @property
  263. def removed(self):
  264. """Return the file total removed lines."""
  265. return sum([hunk.removed for hunk in self])
  266. @property
  267. def is_added_file(self):
  268. """Return True if this patch adds the file."""
  269. return (len(self) == 1 and self[0].source_start == 0 and
  270. self[0].source_length == 0)
  271. @property
  272. def is_removed_file(self):
  273. """Return True if this patch removes the file."""
  274. return (len(self) == 1 and self[0].target_start == 0 and
  275. self[0].target_length == 0)
  276. @property
  277. def is_modified_file(self):
  278. """Return True if this patch modifies the file."""
  279. return not (self.is_added_file or self.is_removed_file)
  280. @implements_to_string
  281. class PatchSet(list):
  282. """A list of PatchedFiles."""
  283. def __init__(self, f, encoding=None):
  284. super(PatchSet, self).__init__()
  285. # convert string inputs to StringIO objects
  286. if isinstance(f, basestring):
  287. f = self._convert_string(f, encoding)
  288. # make sure we pass an iterator object to parse
  289. data = iter(f)
  290. # if encoding is None, assume we are reading unicode data
  291. self._parse(data, encoding=encoding)
  292. def __repr__(self):
  293. return make_str('<PatchSet: %s>') % super(PatchSet, self).__repr__()
  294. def __str__(self):
  295. return ''.join(unicode(patched_file) for patched_file in self)
  296. def _parse(self, diff, encoding):
  297. current_file = None
  298. patch_info = None
  299. diff = enumerate(diff, 1)
  300. for unused_diff_line_no, line in diff:
  301. if encoding is not None:
  302. line = line.decode(encoding)
  303. # check for source file header
  304. is_source_filename = RE_SOURCE_FILENAME.match(line)
  305. if is_source_filename:
  306. source_file = is_source_filename.group('filename')
  307. source_timestamp = is_source_filename.group('timestamp')
  308. # reset current file
  309. current_file = None
  310. continue
  311. # check for target file header
  312. is_target_filename = RE_TARGET_FILENAME.match(line)
  313. if is_target_filename:
  314. if current_file is not None:
  315. raise UnidiffParseError('Target without source: %s' % line)
  316. target_file = is_target_filename.group('filename')
  317. target_timestamp = is_target_filename.group('timestamp')
  318. # add current file to PatchSet
  319. current_file = PatchedFile(
  320. patch_info, source_file, target_file,
  321. source_timestamp, target_timestamp)
  322. self.append(current_file)
  323. patch_info = None
  324. continue
  325. # check for hunk header
  326. is_hunk_header = RE_HUNK_HEADER.match(line)
  327. if is_hunk_header:
  328. if current_file is None:
  329. raise UnidiffParseError('Unexpected hunk found: %s' % line)
  330. current_file._parse_hunk(line, diff, encoding)
  331. continue
  332. # check for no newline marker
  333. is_no_newline = RE_NO_NEWLINE_MARKER.match(line)
  334. if is_no_newline:
  335. if current_file is None:
  336. raise UnidiffParseError('Unexpected marker: %s' % line)
  337. current_file._add_no_newline_marker_to_last_hunk()
  338. continue
  339. # sometimes hunks can be followed by empty lines
  340. if line == '\n' and current_file is not None:
  341. current_file._append_trailing_empty_line()
  342. continue
  343. # if nothing has matched above then this line is a patch info
  344. if patch_info is None:
  345. current_file = None
  346. patch_info = PatchInfo()
  347. patch_info.append(line)
  348. @classmethod
  349. def from_filename(cls, filename, encoding=DEFAULT_ENCODING, errors=None):
  350. """Return a PatchSet instance given a diff filename."""
  351. with open_file(filename, 'r', encoding=encoding, errors=errors) as f:
  352. instance = cls(f)
  353. return instance
  354. @staticmethod
  355. def _convert_string(data, encoding=None, errors='strict'):
  356. if encoding is not None:
  357. # if encoding is given, assume bytes and decode
  358. data = unicode(data, encoding=encoding, errors=errors)
  359. return StringIO(data)
  360. @classmethod
  361. def from_string(cls, data, encoding=None, errors='strict'):
  362. """Return a PatchSet instance given a diff string."""
  363. return cls(cls._convert_string(data, encoding, errors))
  364. @property
  365. def added_files(self):
  366. """Return patch added files as a list."""
  367. return [f for f in self if f.is_added_file]
  368. @property
  369. def removed_files(self):
  370. """Return patch removed files as a list."""
  371. return [f for f in self if f.is_removed_file]
  372. @property
  373. def modified_files(self):
  374. """Return patch modified files as a list."""
  375. return [f for f in self if f.is_modified_file]
  376. @property
  377. def added(self):
  378. """Return the patch total added lines."""
  379. return sum([f.added for f in self])
  380. @property
  381. def removed(self):
  382. """Return the patch total removed lines."""
  383. return sum([f.removed for f in self])