Mercurial > hg > hg-fastimport
comparison hgext3rd/fastimport/vendor/python_fastimport/parser.py @ 86:28704a2a7461 vendor/python-fastimport
Import python-fastimport-0.9.8
| author | Roy Marples <roy@marples.name> |
|---|---|
| date | Tue, 19 Jan 2021 22:56:34 +0000 |
| parents | |
| children | 2fc99e3479d9 |
comparison
equal
deleted
inserted
replaced
| 85:1f5544a8870b | 86:28704a2a7461 |
|---|---|
| 1 # Copyright (C) 2008-2010 Canonical Ltd | |
| 2 # | |
| 3 # This program is free software; you can redistribute it and/or modify | |
| 4 # it under the terms of the GNU General Public License as published by | |
| 5 # the Free Software Foundation; either version 2 of the License, or | |
| 6 # (at your option) any later version. | |
| 7 # | |
| 8 # This program is distributed in the hope that it will be useful, | |
| 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 11 # GNU General Public License for more details. | |
| 12 # | |
| 13 # You should have received a copy of the GNU General Public License | |
| 14 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| 15 | |
| 16 """Parser of import data into command objects. | |
| 17 | |
| 18 In order to reuse existing front-ends, the stream format is a subset of | |
| 19 the one used by git-fast-import (as of the 1.5.4 release of git at least). | |
| 20 The grammar is: | |
| 21 | |
| 22 stream ::= cmd*; | |
| 23 | |
| 24 cmd ::= new_blob | |
| 25 | new_commit | |
| 26 | new_tag | |
| 27 | reset_branch | |
| 28 | checkpoint | |
| 29 | progress | |
| 30 ; | |
| 31 | |
| 32 new_blob ::= 'blob' lf | |
| 33 mark? | |
| 34 file_content; | |
| 35 file_content ::= data; | |
| 36 | |
| 37 new_commit ::= 'commit' sp ref_str lf | |
| 38 mark? | |
| 39 ('author' sp name '<' email '>' when lf)? | |
| 40 'committer' sp name '<' email '>' when lf | |
| 41 commit_msg | |
| 42 ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)? | |
| 43 ('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)* | |
| 44 file_change* | |
| 45 lf?; | |
| 46 commit_msg ::= data; | |
| 47 | |
| 48 file_change ::= file_clr | |
| 49 | file_del | |
| 50 | file_rnm | |
| 51 | file_cpy | |
| 52 | file_obm | |
| 53 | file_inm; | |
| 54 file_clr ::= 'deleteall' lf; | |
| 55 file_del ::= 'D' sp path_str lf; | |
| 56 file_rnm ::= 'R' sp path_str sp path_str lf; | |
| 57 file_cpy ::= 'C' sp path_str sp path_str lf; | |
| 58 file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf; | |
| 59 file_inm ::= 'M' sp mode sp 'inline' sp path_str lf | |
| 60 data; | |
| 61 | |
| 62 new_tag ::= 'tag' sp tag_str lf | |
| 63 'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf | |
| 64 'tagger' sp name '<' email '>' when lf | |
| 65 tag_msg; | |
| 66 tag_msg ::= data; | |
| 67 | |
| 68 reset_branch ::= 'reset' sp ref_str lf | |
| 69 ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)? | |
| 70 lf?; | |
| 71 | |
| 72 checkpoint ::= 'checkpoint' lf | |
| 73 lf?; | |
| 74 | |
| 75 progress ::= 'progress' sp not_lf* lf | |
| 76 lf?; | |
| 77 | |
| 78 # note: the first idnum in a stream should be 1 and subsequent | |
| 79 # idnums should not have gaps between values as this will cause | |
| 80 # the stream parser to reserve space for the gapped values. An | |
| 81 # idnum can be updated in the future to a new object by issuing | |
| 82 # a new mark directive with the old idnum. | |
| 83 # | |
| 84 mark ::= 'mark' sp idnum lf; | |
| 85 data ::= (delimited_data | exact_data) | |
| 86 lf?; | |
| 87 | |
| 88 # note: delim may be any string but must not contain lf. | |
| 89 # data_line may contain any data but must not be exactly | |
| 90 # delim. The lf after the final data_line is included in | |
| 91 # the data. | |
| 92 delimited_data ::= 'data' sp '<<' delim lf | |
| 93 (data_line lf)* | |
| 94 delim lf; | |
| 95 | |
| 96 # note: declen indicates the length of binary_data in bytes. | |
| 97 # declen does not include the lf preceding the binary data. | |
| 98 # | |
| 99 exact_data ::= 'data' sp declen lf | |
| 100 binary_data; | |
| 101 | |
| 102 # note: quoted strings are C-style quoting supporting \c for | |
| 103 # common escapes of 'c' (e.g. \n, \t, \\, \") or \nnn where nnn | |
| 104 # is the signed byte value in octal. Note that the only | |
| 105 # characters which must actually be escaped to protect the | |
| 106 # stream formatting is: \, " and LF. Otherwise these values | |
| 107 # are UTF8. | |
| 108 # | |
| 109 ref_str ::= ref; | |
| 110 sha1exp_str ::= sha1exp; | |
| 111 tag_str ::= tag; | |
| 112 path_str ::= path | '"' quoted(path) '"' ; | |
| 113 mode ::= '100644' | '644' | |
| 114 | '100755' | '755' | |
| 115 | '120000' | |
| 116 ; | |
| 117 | |
| 118 declen ::= # unsigned 32 bit value, ascii base10 notation; | |
| 119 bigint ::= # unsigned integer value, ascii base10 notation; | |
| 120 binary_data ::= # file content, not interpreted; | |
| 121 | |
| 122 when ::= raw_when | rfc2822_when; | |
| 123 raw_when ::= ts sp tz; | |
| 124 rfc2822_when ::= # Valid RFC 2822 date and time; | |
| 125 | |
| 126 sp ::= # ASCII space character; | |
| 127 lf ::= # ASCII newline (LF) character; | |
| 128 | |
| 129 # note: a colon (':') must precede the numerical value assigned to | |
| 130 # an idnum. This is to distinguish it from a ref or tag name as | |
| 131 # GIT does not permit ':' in ref or tag strings. | |
| 132 # | |
| 133 idnum ::= ':' bigint; | |
| 134 path ::= # GIT style file path, e.g. "a/b/c"; | |
| 135 ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT"; | |
| 136 tag ::= # GIT tag name, e.g. "FIREFOX_1_5"; | |
| 137 sha1exp ::= # Any valid GIT SHA1 expression; | |
| 138 hexsha1 ::= # SHA1 in hexadecimal format; | |
| 139 | |
| 140 # note: name and email are UTF8 strings, however name must not | |
| 141 # contain '<' or lf and email must not contain any of the | |
| 142 # following: '<', '>', lf. | |
| 143 # | |
| 144 name ::= # valid GIT author/committer name; | |
| 145 email ::= # valid GIT author/committer email; | |
| 146 ts ::= # time since the epoch in seconds, ascii base10 notation; | |
| 147 tz ::= # GIT style timezone; | |
| 148 | |
| 149 # note: comments may appear anywhere in the input, except | |
| 150 # within a data command. Any form of the data command | |
| 151 # always escapes the related input from comment processing. | |
| 152 # | |
| 153 # In case it is not clear, the '#' that starts the comment | |
| 154 # must be the first character on the line (an lf must have | |
| 155 # preceded it). | |
| 156 # | |
| 157 comment ::= '#' not_lf* lf; | |
| 158 not_lf ::= # Any byte that is not ASCII newline (LF); | |
| 159 """ | |
| 160 from __future__ import print_function | |
| 161 | |
| 162 import collections | |
| 163 import re | |
| 164 import sys | |
| 165 import codecs | |
| 166 | |
| 167 from fastimport import ( | |
| 168 commands, | |
| 169 dates, | |
| 170 errors, | |
| 171 ) | |
| 172 from fastimport.helpers import ( | |
| 173 newobject as object, | |
| 174 utf8_bytes_string, | |
| 175 ) | |
| 176 | |
| 177 | |
| 178 ## Stream parsing ## | |
| 179 | |
class LineBasedParser(object):
    """A line-oriented parser over a byte stream that tracks line numbers.

    Lines may be pushed back (see push_line) to allow one line of
    look-ahead while parsing.
    """

    def __init__(self, input_stream):
        """A Parser that keeps track of line numbers.

        :param input_stream: the file-like object to read from
        """
        self.input = input_stream
        # Number of the most recently read line (0 before any read).
        self.lineno = 0
        # Lines pushed back onto the input stream (LIFO).
        self._buffer = []

    def abort(self, exception, *args):
        """Raise an exception providing line number information."""
        raise exception(self.lineno, *args)

    def readline(self):
        """Get the next line including the newline or '' on EOF."""
        self.lineno += 1
        if self._buffer:
            return self._buffer.pop()
        else:
            return self.input.readline()

    def next_line(self):
        """Get the next line without the newline or None on EOF."""
        line = self.readline()
        if line:
            return line[:-1]
        else:
            return None

    def push_line(self, line):
        """Push line back onto the line buffer.

        :param line: the line with no trailing newline
        """
        self.lineno -= 1
        self._buffer.append(line + b'\n')

    def read_bytes(self, count):
        """Read a given number of bytes from the input stream.

        Throws MissingBytes if the bytes are not found.

        Note: This method does not read from the line buffer.

        :return: a string
        """
        result = self.input.read(count)
        found = len(result)
        # Keep the line counter in step with any newlines inside the blob.
        self.lineno += result.count(b'\n')
        if found != count:
            self.abort(errors.MissingBytes, count, found)
        return result

    def read_until(self, terminator):
        """Read the input stream until the terminator is found.

        Throws MissingTerminator if the terminator is not found.

        Note: This method does not read from the line buffer.

        :return: the bytes read up to but excluding the terminator.
        """
        lines = []
        term = terminator + b'\n'
        while True:
            line = self.input.readline()
            if line == term:
                break
            elif not line:
                # EOF without seeing the terminator.  The previous code
                # looped forever here; honour the documented contract
                # and raise MissingTerminator instead.
                self.abort(errors.MissingTerminator, terminator)
            else:
                lines.append(line)
        return b''.join(lines)
| 255 | |
| 256 | |
# Regular expression used for parsing. (Note: The spec states that the name
# part should be non-empty but git-fast-export doesn't always do that so
# the first bit is \w*, not \w+.) Also git-fast-import code says the
# space before the email is optional.
# Groups: 1 = name (possibly empty, possibly with a trailing space),
# 2 = email, 3 = the raw date ("when") string.
_WHO_AND_WHEN_RE = re.compile(br'([^<]*)<(.*)> (.+)')
# Same as above without the trailing when part; used when a bare
# "name <email>" is acceptable (accept_just_who).
_WHO_RE = re.compile(br'([^<]*)<(.*)>')
| 263 | |
| 264 | |
class ImportParser(LineBasedParser):
    """Parser turning a git-fast-import stream into ImportCommand objects."""

    def __init__(self, input_stream, verbose=False, output=sys.stdout,
            user_mapper=None, strict=True):
        """A Parser of import commands.

        :param input_stream: the file-like object to read from
        :param verbose: display extra information or not
        :param output: the file-like object to write messages to (YAGNI?)
        :param user_mapper: if not None, the UserMapper used to adjust
          user-ids for authors, committers and taggers.
        :param strict: Raise errors on strictly invalid data
        """
        LineBasedParser.__init__(self, input_stream)
        self.verbose = verbose
        self.output = output
        self.user_mapper = user_mapper
        self.strict = strict
        # We auto-detect the date format when a date is first encountered
        self.date_parser = None
        # Features declared by 'feature' commands so far: name -> value.
        self.features = {}

    def warning(self, msg):
        """Write a warning (tagged with the current line number) to stderr."""
        sys.stderr.write("warning line %d: %s\n" % (self.lineno, msg))

    def iter_commands(self):
        """Iterator returning ImportCommand objects."""
        while True:
            line = self.next_line()
            if line is None:
                # EOF.  If the stream declared the 'done' feature, an
                # explicit 'done' command was required before EOF.
                if b'done' in self.features:
                    raise errors.PrematureEndOfStream(self.lineno)
                break
            elif len(line) == 0 or line.startswith(b'#'):
                # Blank lines and comments may appear between commands.
                continue
            # Search for commands in order of likelihood
            elif line.startswith(b'commit '):
                yield self._parse_commit(line[len(b'commit '):])
            elif line.startswith(b'blob'):
                yield self._parse_blob()
            elif line.startswith(b'done'):
                break
            elif line.startswith(b'progress '):
                yield commands.ProgressCommand(line[len(b'progress '):])
            elif line.startswith(b'reset '):
                yield self._parse_reset(line[len(b'reset '):])
            elif line.startswith(b'tag '):
                yield self._parse_tag(line[len(b'tag '):])
            elif line.startswith(b'checkpoint'):
                yield commands.CheckpointCommand()
            elif line.startswith(b'feature'):
                yield self._parse_feature(line[len(b'feature '):])
            else:
                self.abort(errors.InvalidCommand, line)

    def iter_file_commands(self):
        """Iterator returning FileCommand objects.

        If an invalid file command is found, the line is silently
        pushed back and iteration ends.
        """
        while True:
            line = self.next_line()
            if line is None:
                break
            elif len(line) == 0 or line.startswith(b'#'):
                continue
            # Search for file commands in order of likelihood
            elif line.startswith(b'M '):
                yield self._parse_file_modify(line[2:])
            elif line.startswith(b'D '):
                path = self._path(line[2:])
                yield commands.FileDeleteCommand(path)
            elif line.startswith(b'R '):
                old, new = self._path_pair(line[2:])
                yield commands.FileRenameCommand(old, new)
            elif line.startswith(b'C '):
                src, dest = self._path_pair(line[2:])
                yield commands.FileCopyCommand(src, dest)
            elif line.startswith(b'deleteall'):
                yield commands.FileDeleteAllCommand()
            else:
                # Not a file command: hand the line back to the caller.
                self.push_line(line)
                break

    def _parse_blob(self):
        """Parse a blob command."""
        lineno = self.lineno
        mark = self._get_mark_if_any()
        data = self._get_data(b'blob')
        return commands.BlobCommand(mark, data, lineno)

    def _parse_commit(self, ref):
        """Parse a commit command.

        :param ref: the ref name bytes following 'commit '
        """
        lineno = self.lineno
        mark = self._get_mark_if_any()
        author = self._get_user_info(b'commit', b'author', False)
        more_authors = []
        while True:
            another_author = self._get_user_info(b'commit', b'author', False)
            if another_author is not None:
                more_authors.append(another_author)
            else:
                break
        committer = self._get_user_info(b'commit', b'committer')
        message = self._get_data(b'commit', b'message')
        from_ = self._get_from()
        merges = []
        while True:
            merge = self._get_merge()
            if merge is not None:
                # while the spec suggests it's illegal, git-fast-export
                # outputs multiple merges on the one line, e.g.
                # merge :x :y :z
                these_merges = merge.split(b' ')
                merges.extend(these_merges)
            else:
                break
        properties = {}
        while True:
            name_value = self._get_property()
            if name_value is not None:
                name, value = name_value
                properties[name] = value
            else:
                break
        return commands.CommitCommand(ref, mark, author, committer, message,
            from_, merges, list(self.iter_file_commands()), lineno=lineno,
            more_authors=more_authors, properties=properties)

    def _parse_feature(self, info):
        """Parse a feature command."""
        parts = info.split(b'=', 1)
        name = parts[0]
        if len(parts) > 1:
            value = self._path(parts[1])
        else:
            value = None
        self.features[name] = value
        return commands.FeatureCommand(name, value, lineno=self.lineno)

    def _parse_file_modify(self, info):
        """Parse a filemodify command within a commit.

        :param info: a string in the format "mode dataref path"
          (where dataref might be the hard-coded literal 'inline').
        """
        params = info.split(b' ', 2)
        path = self._path(params[2])
        mode = self._mode(params[0])
        if params[1] == b'inline':
            dataref = None
            data = self._get_data(b'filemodify')
        else:
            dataref = params[1]
            data = None
        return commands.FileModifyCommand(path, mode, dataref,
            data)

    def _parse_reset(self, ref):
        """Parse a reset command."""
        from_ = self._get_from()
        return commands.ResetCommand(ref, from_)

    def _parse_tag(self, name):
        """Parse a tag command."""
        from_ = self._get_from(b'tag')
        tagger = self._get_user_info(b'tag', b'tagger',
            accept_just_who=True)
        message = self._get_data(b'tag', b'message')
        return commands.TagCommand(name, from_, tagger, message)

    def _get_mark_if_any(self):
        """Parse a mark section.

        :return: the mark id bytes, or None when the next line is not a
            mark (the line is pushed back) or the stream has ended
        """
        line = self.next_line()
        if line is None:
            # EOF.  The previous code crashed here with AttributeError
            # (None.startswith); there is simply no mark.
            return None
        if line.startswith(b'mark :'):
            return line[len(b'mark :'):]
        else:
            self.push_line(line)
            return None

    def _get_from(self, required_for=None):
        """Parse a from section."""
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith(b'from '):
            return line[len(b'from '):]
        elif required_for:
            self.abort(errors.MissingSection, required_for, 'from')
        else:
            self.push_line(line)
            return None

    def _get_merge(self):
        """Parse a merge section."""
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith(b'merge '):
            return line[len(b'merge '):]
        else:
            self.push_line(line)
            return None

    def _get_property(self):
        """Parse a property section."""
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith(b'property '):
            return self._name_value(line[len(b'property '):])
        else:
            self.push_line(line)
            return None

    def _get_user_info(self, cmd, section, required=True,
            accept_just_who=False):
        """Parse a user section."""
        line = self.next_line()
        if line is None:
            # EOF.  Report a missing section rather than crashing with
            # AttributeError (None.startswith) as the old code did.
            if required:
                self.abort(errors.MissingSection, cmd, section)
            return None
        if line.startswith(section + b' '):
            return self._who_when(line[len(section + b' '):], cmd, section,
                accept_just_who=accept_just_who)
        elif required:
            self.abort(errors.MissingSection, cmd, section)
        else:
            self.push_line(line)
            return None

    def _get_data(self, required_for, section=b'data'):
        """Parse a data section."""
        line = self.next_line()
        if line is None:
            # EOF where a data section was mandatory.
            self.abort(errors.MissingSection, required_for, section)
        if line.startswith(b'data '):
            rest = line[len(b'data '):]
            if rest.startswith(b'<<'):
                # Delimited format: read until the delimiter line.
                return self.read_until(rest[2:])
            else:
                size = int(rest)
                read_bytes = self.read_bytes(size)
                # optional LF after data: consume it but push back any
                # real content.  At EOF nothing is pushed back (the old
                # code pushed a phantom empty line, masking EOF from
                # iter_commands' 'done' feature check).
                next_line = self.input.readline()
                self.lineno += 1
                if next_line and next_line != b'\n':
                    self.push_line(next_line[:-1])
                return read_bytes
        else:
            self.abort(errors.MissingSection, required_for, section)

    def _who_when(self, s, cmd, section, accept_just_who=False):
        """Parse who and when information from a string.

        :return: a tuple of (name,email,timestamp,timezone). name may be
            the empty string if only an email address was given.
        """
        match = _WHO_AND_WHEN_RE.search(s)
        if match:
            datestr = match.group(3).lstrip()
            if self.date_parser is None:
                # auto-detect the date format from the first date seen
                if len(datestr.split(b' ')) == 2:
                    date_format = 'raw'
                elif datestr == b'now':
                    date_format = 'now'
                else:
                    date_format = 'rfc2822'
                self.date_parser = dates.DATE_PARSERS_BY_NAME[date_format]
            try:
                when = self.date_parser(datestr, self.lineno)
            except ValueError:
                print("failed to parse datestr '%s'" % (datestr,))
                raise
            name = match.group(1).rstrip()
            email = match.group(2)
        else:
            match = _WHO_RE.search(s)
            if accept_just_who and match:
                # HACK around missing time
                # TODO: output a warning here
                when = dates.DATE_PARSERS_BY_NAME['now']('now')
                name = match.group(1)
                email = match.group(2)
            elif self.strict:
                self.abort(errors.BadFormat, cmd, section, s)
            else:
                name = s
                email = None
                when = dates.DATE_PARSERS_BY_NAME['now']('now')
        if len(name) > 0:
            if name.endswith(b' '):
                name = name[:-1]
        # While it shouldn't happen, some datasets have email addresses
        # which contain unicode characters. See bug 338186. We sanitize
        # the data at this level just in case.
        if self.user_mapper:
            name, email = self.user_mapper.map_name_and_email(name, email)

        return Authorship(name, email, when[0], when[1])

    def _name_value(self, s):
        """Parse a (name,value) tuple from 'name value-length value'."""
        parts = s.split(b' ', 2)
        name = parts[0]
        if len(parts) == 1:
            value = None
        else:
            size = int(parts[1])
            value = parts[2]
            still_to_read = size - len(value)
            if still_to_read > 0:
                # The value continues past the end of the line.
                read_bytes = self.read_bytes(still_to_read)
                value += b'\n' + read_bytes[:still_to_read - 1]
        return (name, value)

    def _path(self, s):
        """Parse a path, unquoting it if it is C-style quoted."""
        if s.startswith(b'"'):
            if not s.endswith(b'"'):
                self.abort(errors.BadFormat, '?', '?', s)
            else:
                return _unquote_c_string(s[1:-1])
        return s

    def _path_pair(self, s):
        """Parse two paths separated by a space."""
        # TODO: handle a space in the first path
        if s.startswith(b'"'):
            parts = s[1:].split(b'" ', 1)
        else:
            parts = s.split(b' ', 1)
        if len(parts) != 2:
            self.abort(errors.BadFormat, '?', '?', s)
        elif parts[1].startswith(b'"') and parts[1].endswith(b'"'):
            parts[1] = parts[1][1:-1]
        elif parts[1].startswith(b'"') or parts[1].endswith(b'"'):
            self.abort(errors.BadFormat, '?', '?', s)
        return [_unquote_c_string(s) for s in parts]

    def _mode(self, s):
        """Check file mode format and parse into an int.

        :return: mode as integer
        """
        # Note: Output from git-fast-export slightly different to spec
        if s in [b'644', b'100644', b'0100644']:
            return 0o100644
        elif s in [b'755', b'100755', b'0100755']:
            return 0o100755
        elif s in [b'040000', b'0040000']:
            return 0o40000
        elif s in [b'120000', b'0120000']:
            return 0o120000
        elif s in [b'160000', b'0160000']:
            return 0o160000
        else:
            self.abort(errors.BadFormat, 'filemodify', 'mode', s)
| 620 | |
| 621 | |
# Regular expressions matching one C-style escape sequence; used by
# _unquote_c_string() below to decode quoted paths.  The first variant
# operates on byte strings, the second on (unicode) text strings.
ESCAPE_SEQUENCE_BYTES_RE = re.compile(br'''
    ( \\U........ # 8-digit hex escapes
    | \\u.... # 4-digit hex escapes
    | \\x.. # 2-digit hex escapes
    | \\[0-7]{1,3} # Octal escapes
    | \\N\{[^}]+\} # Unicode characters by name
    | \\[\\'"abfnrtv] # Single-character escapes
    )''', re.VERBOSE
)

ESCAPE_SEQUENCE_RE = re.compile(r'''
    ( \\U........
    | \\u....
    | \\x..
    | \\[0-7]{1,3}
    | \\N\{[^}]+\}
    | \\[\\'"abfnrtv]
    )''', re.UNICODE | re.VERBOSE
)
| 641 | |
def _unquote_c_string(s):
    r"""Replace C-style escape sequences (\n, \", etc.) with real chars.

    The docstring is raw on purpose: the original non-raw version turned
    the literal "\n" in it into an actual newline character.

    :param s: the possibly-escaped path text (bytes on Python 3)
    :return: *s* with each escape sequence replaced by the character it
        denotes, in the same str/bytes flavour as the input
    """
    # doing a s.encode('utf-8').decode('unicode_escape') can return an
    # incorrect output with unicode string (both in py2 and py3) the safest way
    # is to match the escape sequences and decoding them alone.
    def decode_match(match):
        return utf8_bytes_string(
            codecs.decode(match.group(0), 'unicode-escape')
        )

    if sys.version_info[0] >= 3 and isinstance(s, bytes):
        return ESCAPE_SEQUENCE_BYTES_RE.sub(decode_match, s)
    else:
        return ESCAPE_SEQUENCE_RE.sub(decode_match, s)
| 657 | |
| 658 | |
| 659 Authorship = collections.namedtuple('Authorship', 'name email timestamp timezone') |
