comparison hgext3rd/fastimport/vendor/python_fastimport/parser.py @ 86:28704a2a7461 vendor/python-fastimport

Import python-fastimport-0.9.8
author Roy Marples <roy@marples.name>
date Tue, 19 Jan 2021 22:56:34 +0000
parents
children 2fc99e3479d9
comparison
equal deleted inserted replaced
85:1f5544a8870b 86:28704a2a7461
1 # Copyright (C) 2008-2010 Canonical Ltd
2 #
3 # This program is free software; you can redistribute it and/or modify
4 # it under the terms of the GNU General Public License as published by
5 # the Free Software Foundation; either version 2 of the License, or
6 # (at your option) any later version.
7 #
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
12 #
13 # You should have received a copy of the GNU General Public License
14 # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
16 """Parser of import data into command objects.
17
18 In order to reuse existing front-ends, the stream format is a subset of
19 the one used by git-fast-import (as of the 1.5.4 release of git at least).
20 The grammar is:
21
22 stream ::= cmd*;
23
24 cmd ::= new_blob
25 | new_commit
26 | new_tag
27 | reset_branch
28 | checkpoint
29 | progress
30 ;
31
32 new_blob ::= 'blob' lf
33 mark?
34 file_content;
35 file_content ::= data;
36
37 new_commit ::= 'commit' sp ref_str lf
38 mark?
39 ('author' sp name '<' email '>' when lf)?
40 'committer' sp name '<' email '>' when lf
41 commit_msg
42 ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
43 ('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)*
44 file_change*
45 lf?;
46 commit_msg ::= data;
47
48 file_change ::= file_clr
49 | file_del
50 | file_rnm
51 | file_cpy
52 | file_obm
53 | file_inm;
54 file_clr ::= 'deleteall' lf;
55 file_del ::= 'D' sp path_str lf;
56 file_rnm ::= 'R' sp path_str sp path_str lf;
57 file_cpy ::= 'C' sp path_str sp path_str lf;
58 file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf;
59 file_inm ::= 'M' sp mode sp 'inline' sp path_str lf
60 data;
61
62 new_tag ::= 'tag' sp tag_str lf
63 'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf
64 'tagger' sp name '<' email '>' when lf
65 tag_msg;
66 tag_msg ::= data;
67
68 reset_branch ::= 'reset' sp ref_str lf
69 ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
70 lf?;
71
72 checkpoint ::= 'checkpoint' lf
73 lf?;
74
75 progress ::= 'progress' sp not_lf* lf
76 lf?;
77
78 # note: the first idnum in a stream should be 1 and subsequent
79 # idnums should not have gaps between values as this will cause
80 # the stream parser to reserve space for the gapped values. An
81 # idnum can be updated in the future to a new object by issuing
82 # a new mark directive with the old idnum.
83 #
84 mark ::= 'mark' sp idnum lf;
85 data ::= (delimited_data | exact_data)
86 lf?;
87
88 # note: delim may be any string but must not contain lf.
89 # data_line may contain any data but must not be exactly
90 # delim. The lf after the final data_line is included in
91 # the data.
92 delimited_data ::= 'data' sp '<<' delim lf
93 (data_line lf)*
94 delim lf;
95
96 # note: declen indicates the length of binary_data in bytes.
97 # declen does not include the lf preceding the binary data.
98 #
99 exact_data ::= 'data' sp declen lf
100 binary_data;
101
102 # note: quoted strings are C-style quoting supporting \c for
103 # common escapes of 'c' (e.g. \n, \t, \\, \") or \nnn where nnn
104 # is the signed byte value in octal. Note that the only
105 # characters which must actually be escaped to protect the
106 # stream formatting are: \, " and LF. Otherwise these values
107 # are UTF8.
108 #
109 ref_str ::= ref;
110 sha1exp_str ::= sha1exp;
111 tag_str ::= tag;
112 path_str ::= path | '"' quoted(path) '"' ;
113 mode ::= '100644' | '644'
114 | '100755' | '755'
115 | '120000'
116 ;
117
118 declen ::= # unsigned 32 bit value, ascii base10 notation;
119 bigint ::= # unsigned integer value, ascii base10 notation;
120 binary_data ::= # file content, not interpreted;
121
122 when ::= raw_when | rfc2822_when;
123 raw_when ::= ts sp tz;
124 rfc2822_when ::= # Valid RFC 2822 date and time;
125
126 sp ::= # ASCII space character;
127 lf ::= # ASCII newline (LF) character;
128
129 # note: a colon (':') must precede the numerical value assigned to
130 # an idnum. This is to distinguish it from a ref or tag name as
131 # GIT does not permit ':' in ref or tag strings.
132 #
133 idnum ::= ':' bigint;
134 path ::= # GIT style file path, e.g. "a/b/c";
135 ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT";
136 tag ::= # GIT tag name, e.g. "FIREFOX_1_5";
137 sha1exp ::= # Any valid GIT SHA1 expression;
138 hexsha1 ::= # SHA1 in hexadecimal format;
139
140 # note: name and email are UTF8 strings, however name must not
141 # contain '<' or lf and email must not contain any of the
142 # following: '<', '>', lf.
143 #
144 name ::= # valid GIT author/committer name;
145 email ::= # valid GIT author/committer email;
146 ts ::= # time since the epoch in seconds, ascii base10 notation;
147 tz ::= # GIT style timezone;
148
149 # note: comments may appear anywhere in the input, except
150 # within a data command. Any form of the data command
151 # always escapes the related input from comment processing.
152 #
153 # In case it is not clear, the '#' that starts the comment
154 # must be the first character on the line (an lf must have
155 # preceded it).
156 #
157 comment ::= '#' not_lf* lf;
158 not_lf ::= # Any byte that is not ASCII newline (LF);
159 """
160 from __future__ import print_function
161
162 import collections
163 import re
164 import sys
165 import codecs
166
167 from fastimport import (
168 commands,
169 dates,
170 errors,
171 )
172 from fastimport.helpers import (
173 newobject as object,
174 utf8_bytes_string,
175 )
176
177
178 ## Stream parsing ##
179
class LineBasedParser(object):
    """A line-oriented reader that keeps track of the current line number.

    Lines are read from the wrapped stream with ``readline()``; a line
    that turns out not to belong to the current construct can be handed
    back with ``push_line()`` and will be returned by the next read.
    """

    def __init__(self, input_stream):
        """A Parser that keeps track of line numbers.

        :param input_stream: the file-like object to read from. Its
            ``readline()`` and ``read(count)`` results are compared with
            and joined to byte strings, so it must yield bytes.
        """
        self.input = input_stream
        self.lineno = 0
        # Lines pushed back onto the input stream
        self._buffer = []

    def abort(self, exception, *args):
        """Raise an exception providing line number information.

        :param exception: the exception class to raise; it is called as
            ``exception(self.lineno, *args)``.
        """
        raise exception(self.lineno, *args)

    def readline(self):
        """Get the next line including the newline or '' on EOF.

        Pushed-back lines are returned before the stream is consulted.
        """
        self.lineno += 1
        if self._buffer:
            return self._buffer.pop()
        return self.input.readline()

    def next_line(self):
        """Get the next line without the newline or None on EOF."""
        line = self.readline()
        if line:
            # Drop the final character, i.e. the line terminator.
            return line[:-1]
        return None

    def push_line(self, line):
        """Push line back onto the line buffer.

        :param line: the line with no trailing newline
        """
        self.lineno -= 1
        self._buffer.append(line + b'\n')

    def read_bytes(self, count):
        """Read a given number of bytes from the input stream.

        Throws MissingBytes if the bytes are not found.

        Note: This method does not read from the line buffer.

        :param count: the number of bytes expected
        :return: a string
        """
        result = self.input.read(count)
        found = len(result)
        # Keep the line counter in step with the newlines just consumed.
        self.lineno += result.count(b'\n')
        if found != count:
            self.abort(errors.MissingBytes, count, found)
        return result

    def read_until(self, terminator):
        """Read the input stream until the terminator is found.

        Throws MissingTerminator if the terminator is not found.

        Note: This method does not read from the line buffer.

        :param terminator: the delimiter, without its trailing newline
        :return: the bytes read up to but excluding the terminator.
        """
        lines = []
        term = terminator + b'\n'
        while True:
            line = self.input.readline()
            if not line:
                # EOF: readline() would keep returning an empty string,
                # so without this check the loop would never finish.
                self.abort(errors.MissingTerminator, terminator)
            # Each line read here (terminator included) is one more line
            # of input consumed.
            self.lineno += 1
            if line == term:
                break
            lines.append(line)
        return b''.join(lines)
255
256
# Regular expressions used for parsing "who when" and "who" sections.
# (Note: The spec states that the name part should be non-empty but
# git-fast-export doesn't always do that, so the name group is [^<]* and
# may match the empty string.) Also git-fast-import code says the space
# before the email is optional, so nothing is required between the name
# group and the '<'.
# Groups: 1 = name (may carry trailing whitespace), 2 = email and, for
# _WHO_AND_WHEN_RE only, 3 = the date text after "> ".
_WHO_AND_WHEN_RE = re.compile(br'([^<]*)<(.*)> (.+)')
_WHO_RE = re.compile(br'([^<]*)<(.*)>')
263
264
class ImportParser(LineBasedParser):
    """Parser that turns a fast-import stream into command objects."""

    def __init__(self, input_stream, verbose=False, output=sys.stdout,
                 user_mapper=None, strict=True):
        """A Parser of import commands.

        :param input_stream: the file-like object to read from
        :param verbose: display extra information or not
        :param output: the file-like object to write messages to (YAGNI?)
        :param user_mapper: if not None, the UserMapper used to adjust
          user-ids for authors, committers and taggers.
        :param strict: Raise errors on strictly invalid data
        """
        LineBasedParser.__init__(self, input_stream)
        self.verbose = verbose
        self.output = output
        self.user_mapper = user_mapper
        self.strict = strict
        # We auto-detect the date format when a date is first encountered
        self.date_parser = None
        # Feature name -> value (or None), filled in by 'feature' commands.
        self.features = {}

    def warning(self, msg):
        """Write a warning, prefixed with the current line number, to stderr."""
        sys.stderr.write("warning line %d: %s\n" % (self.lineno, msg))

    def iter_commands(self):
        """Iterator returning ImportCommand objects.

        Iteration ends at EOF or at a 'done' line. Reaching EOF after a
        'done' feature was declared raises PrematureEndOfStream; an
        unrecognised line aborts with InvalidCommand.
        """
        while True:
            line = self.next_line()
            if line is None:
                if b'done' in self.features:
                    raise errors.PrematureEndOfStream(self.lineno)
                break
            elif len(line) == 0 or line.startswith(b'#'):
                # Blank lines and comments carry no command.
                continue
            # Search for commands in order of likelihood
            elif line.startswith(b'commit '):
                yield self._parse_commit(line[len(b'commit '):])
            elif line.startswith(b'blob'):
                yield self._parse_blob()
            elif line.startswith(b'done'):
                break
            elif line.startswith(b'progress '):
                yield commands.ProgressCommand(line[len(b'progress '):])
            elif line.startswith(b'reset '):
                yield self._parse_reset(line[len(b'reset '):])
            elif line.startswith(b'tag '):
                yield self._parse_tag(line[len(b'tag '):])
            elif line.startswith(b'checkpoint'):
                yield commands.CheckpointCommand()
            elif line.startswith(b'feature'):
                yield self._parse_feature(line[len(b'feature '):])
            else:
                self.abort(errors.InvalidCommand, line)

    def iter_file_commands(self):
        """Iterator returning FileCommand objects.

        If an invalid file command is found, the line is silently
        pushed back and iteration ends.
        """
        while True:
            line = self.next_line()
            if line is None:
                break
            elif len(line) == 0 or line.startswith(b'#'):
                continue
            # Search for file commands in order of likelihood
            elif line.startswith(b'M '):
                yield self._parse_file_modify(line[2:])
            elif line.startswith(b'D '):
                path = self._path(line[2:])
                yield commands.FileDeleteCommand(path)
            elif line.startswith(b'R '):
                old, new = self._path_pair(line[2:])
                yield commands.FileRenameCommand(old, new)
            elif line.startswith(b'C '):
                src, dest = self._path_pair(line[2:])
                yield commands.FileCopyCommand(src, dest)
            elif line.startswith(b'deleteall'):
                yield commands.FileDeleteAllCommand()
            else:
                self.push_line(line)
                break

    def _parse_blob(self):
        """Parse a blob command."""
        lineno = self.lineno
        mark = self._get_mark_if_any()
        data = self._get_data(b'blob')
        return commands.BlobCommand(mark, data, lineno)

    def _parse_commit(self, ref):
        """Parse a commit command.

        :param ref: the text following 'commit ' on the command line
        """
        lineno = self.lineno
        mark = self._get_mark_if_any()
        author = self._get_user_info(b'commit', b'author', False)
        # Any further 'author' lines are collected separately.
        more_authors = []
        while True:
            another_author = self._get_user_info(b'commit', b'author', False)
            if another_author is None:
                break
            more_authors.append(another_author)
        committer = self._get_user_info(b'commit', b'committer')
        message = self._get_data(b'commit', b'message')
        from_ = self._get_from()
        merges = []
        while True:
            merge = self._get_merge()
            if merge is None:
                break
            # while the spec suggests it's illegal, git-fast-export
            # outputs multiple merges on the one line, e.g.
            # merge :x :y :z
            merges.extend(merge.split(b' '))
        properties = {}
        while True:
            name_value = self._get_property()
            if name_value is None:
                break
            name, value = name_value
            properties[name] = value
        file_commands = list(self.iter_file_commands())
        return commands.CommitCommand(
            ref, mark, author, committer, message, from_, merges,
            file_commands, lineno=lineno, more_authors=more_authors,
            properties=properties)

    def _parse_feature(self, info):
        """Parse a feature command and record it in ``self.features``.

        :param info: the text following 'feature ', i.e. 'name' or
            'name=value'
        """
        parts = info.split(b'=', 1)
        name = parts[0]
        if len(parts) > 1:
            value = self._path(parts[1])
        else:
            value = None
        self.features[name] = value
        return commands.FeatureCommand(name, value, lineno=self.lineno)

    def _parse_file_modify(self, info):
        """Parse a filemodify command within a commit.

        :param info: a string in the format "mode dataref path"
          (where dataref might be the hard-coded literal 'inline').
        """
        params = info.split(b' ', 2)
        path = self._path(params[2])
        mode = self._mode(params[0])
        if params[1] == b'inline':
            # The content follows as a data section.
            dataref = None
            data = self._get_data(b'filemodify')
        else:
            dataref = params[1]
            data = None
        return commands.FileModifyCommand(path, mode, dataref, data)

    def _parse_reset(self, ref):
        """Parse a reset command.

        :param ref: the text following 'reset ' on the command line
        """
        from_ = self._get_from()
        return commands.ResetCommand(ref, from_)

    def _parse_tag(self, name):
        """Parse a tag command.

        :param name: the text following 'tag ' on the command line
        """
        from_ = self._get_from(b'tag')
        tagger = self._get_user_info(b'tag', b'tagger',
                                     accept_just_who=True)
        message = self._get_data(b'tag', b'message')
        return commands.TagCommand(name, from_, tagger, message)

    def _get_mark_if_any(self):
        """Parse a mark section.

        :return: the text after 'mark :', or None when the next line is
            not a mark (the line is pushed back) or the stream has ended.
        """
        line = self.next_line()
        if line is None:
            # EOF: there is nothing to inspect or to push back.
            return None
        if line.startswith(b'mark :'):
            return line[len(b'mark :'):]
        self.push_line(line)
        return None

    def _get_from(self, required_for=None):
        """Parse a from section.

        :param required_for: when set, the name of the command that needs
            a 'from' line; a different line then aborts with
            MissingSection instead of being pushed back.
        :return: the text after 'from ', or None.
        """
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith(b'from '):
            return line[len(b'from '):]
        elif required_for:
            self.abort(errors.MissingSection, required_for, 'from')
        else:
            self.push_line(line)
            return None

    def _get_merge(self):
        """Parse a merge section.

        :return: the text after 'merge ', or None (line pushed back).
        """
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith(b'merge '):
            return line[len(b'merge '):]
        else:
            self.push_line(line)
            return None

    def _get_property(self):
        """Parse a property section.

        :return: a (name, value) tuple, or None (line pushed back).
        """
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith(b'property '):
            return self._name_value(line[len(b'property '):])
        else:
            self.push_line(line)
            return None

    def _get_user_info(self, cmd, section, required=True,
                       accept_just_who=False):
        """Parse a user section.

        :param cmd: name of the enclosing command, used in errors
        :param section: the section keyword, e.g. b'author'
        :param required: abort with MissingSection if the section is
            absent; otherwise the line is pushed back and None returned
        :param accept_just_who: passed through to ``_who_when``
        """
        line = self.next_line()
        prefix = section + b' '
        if line is not None and line.startswith(prefix):
            return self._who_when(line[len(prefix):], cmd, section,
                                  accept_just_who=accept_just_who)
        if required:
            self.abort(errors.MissingSection, cmd, section)
        if line is not None:
            # At EOF there is no line to hand back.
            self.push_line(line)
        return None

    def _get_data(self, required_for, section=b'data'):
        """Parse a data section.

        Both forms are handled: 'data <<delim' (read up to the delimiter
        line) and 'data <count>' (read exactly count bytes, then consume
        one optional newline).

        :param required_for: name of the enclosing command, used in errors
        :param section: the section name reported if the data is missing
        :return: the data as bytes
        """
        line = self.next_line()
        if line is None or not line.startswith(b'data '):
            self.abort(errors.MissingSection, required_for, section)
        rest = line[len(b'data '):]
        if rest.startswith(b'<<'):
            return self.read_until(rest[2:])
        size = int(rest)
        read_bytes = self.read_bytes(size)
        # optional LF after data.
        trailer = self.input.readline()
        if trailer:
            self.lineno += 1
            if trailer != b'\n':
                # Not the optional LF: this is the next command, so give
                # it back. Only strip a newline that is actually there,
                # otherwise an unterminated final line loses a byte.
                if trailer.endswith(b'\n'):
                    trailer = trailer[:-1]
                self.push_line(trailer)
        return read_bytes

    def _who_when(self, s, cmd, section, accept_just_who=False):
        """Parse who and when information from a string.

        :param s: the text after the section keyword
        :param cmd: name of the enclosing command, used in errors
        :param section: the section keyword, used in errors
        :param accept_just_who: accept 'name <email>' with no date and
            substitute the result of the 'now' date parser
        :return: an Authorship of (name,email,timestamp,timezone). name
          may be the empty string if only an email address was given.
        """
        match = _WHO_AND_WHEN_RE.search(s)
        if match:
            datestr = match.group(3).lstrip()
            if self.date_parser is None:
                # auto-detect the date format
                if len(datestr.split(b' ')) == 2:
                    date_format = 'raw'
                elif datestr == b'now':
                    date_format = 'now'
                else:
                    date_format = 'rfc2822'
                self.date_parser = dates.DATE_PARSERS_BY_NAME[date_format]
            try:
                when = self.date_parser(datestr, self.lineno)
            except ValueError:
                # Report through the parser's own warning channel rather
                # than printing to stdout, then let the error propagate.
                self.warning("failed to parse datestr '%s'" % (datestr,))
                raise
            name = match.group(1).rstrip()
            email = match.group(2)
        else:
            match = _WHO_RE.search(s)
            if accept_just_who and match:
                # HACK around missing time
                # TODO: output a warning here
                when = dates.DATE_PARSERS_BY_NAME['now']('now')
                name = match.group(1)
                email = match.group(2)
            elif self.strict:
                self.abort(errors.BadFormat, cmd, section, s)
            else:
                # Lenient mode: treat the whole text as the name.
                name = s
                email = None
                when = dates.DATE_PARSERS_BY_NAME['now']('now')
        if len(name) > 0:
            if name.endswith(b' '):
                name = name[:-1]
        # Let the optional user mapper rewrite the identity.
        if self.user_mapper:
            name, email = self.user_mapper.map_name_and_email(name, email)

        return Authorship(name, email, when[0], when[1])

    def _name_value(self, s):
        """Parse a (name,value) tuple from 'name value-length value'."""
        parts = s.split(b' ', 2)
        name = parts[0]
        if len(parts) == 1:
            value = None
        else:
            size = int(parts[1])
            value = parts[2]
            still_to_read = size - len(value)
            if still_to_read > 0:
                # The value continues past the end of this line: one of
                # the outstanding bytes is the newline that ended it, and
                # the last byte read is dropped from the value.
                read_bytes = self.read_bytes(still_to_read)
                value += b'\n' + read_bytes[:still_to_read - 1]
        return (name, value)

    def _path(self, s):
        """Parse a path.

        A path starting with a double quote must also end with one and
        is unquoted; any other path is returned unchanged.
        """
        if s.startswith(b'"'):
            if not s.endswith(b'"'):
                self.abort(errors.BadFormat, '?', '?', s)
            else:
                return _unquote_c_string(s[1:-1])
        return s

    def _path_pair(self, s):
        """Parse two paths separated by a space.

        :return: a list of the two unquoted paths
        """
        # TODO: handle a space in the first path
        if s.startswith(b'"'):
            parts = s[1:].split(b'" ', 1)
        else:
            parts = s.split(b' ', 1)
        if len(parts) != 2:
            self.abort(errors.BadFormat, '?', '?', s)
        elif parts[1].startswith(b'"') and parts[1].endswith(b'"'):
            parts[1] = parts[1][1:-1]
        elif parts[1].startswith(b'"') or parts[1].endswith(b'"'):
            # A quote at only one end of the second path is malformed.
            self.abort(errors.BadFormat, '?', '?', s)
        return [_unquote_c_string(part) for part in parts]

    def _mode(self, s):
        """Check file mode format and parse into an int.

        :return: mode as integer
        """
        # Note: Output from git-fast-export slightly different to spec
        if s in [b'644', b'100644', b'0100644']:
            return 0o100644
        elif s in [b'755', b'100755', b'0100755']:
            return 0o100755
        elif s in [b'040000', b'0040000']:
            return 0o40000
        elif s in [b'120000', b'0120000']:
            return 0o120000
        elif s in [b'160000', b'0160000']:
            return 0o160000
        else:
            self.abort(errors.BadFormat, 'filemodify', 'mode', s)
620
621
622 ESCAPE_SEQUENCE_BYTES_RE = re.compile(br'''
623 ( \\U........ # 8-digit hex escapes
624 | \\u.... # 4-digit hex escapes
625 | \\x.. # 2-digit hex escapes
626 | \\[0-7]{1,3} # Octal escapes
627 | \\N\{[^}]+\} # Unicode characters by name
628 | \\[\\'"abfnrtv] # Single-character escapes
629 )''', re.VERBOSE
630 )
631
632 ESCAPE_SEQUENCE_RE = re.compile(r'''
633 ( \\U........
634 | \\u....
635 | \\x..
636 | \\[0-7]{1,3}
637 | \\N\{[^}]+\}
638 | \\[\\'"abfnrtv]
639 )''', re.UNICODE | re.VERBOSE
640 )
641
642 def _unquote_c_string(s):
643 """replace C-style escape sequences (\n, \", etc.) with real chars."""
644
645 # doing a s.encode('utf-8').decode('unicode_escape') can return an
646 # incorrect output with unicode string (both in py2 and py3) the safest way
647 # is to match the escape sequences and decoding them alone.
648 def decode_match(match):
649 return utf8_bytes_string(
650 codecs.decode(match.group(0), 'unicode-escape')
651 )
652
653 if sys.version_info[0] >= 3 and isinstance(s, bytes):
654 return ESCAPE_SEQUENCE_BYTES_RE.sub(decode_match, s)
655 else:
656 return ESCAPE_SEQUENCE_RE.sub(decode_match, s)
657
658
# Identity and time of an author, committer or tagger, as built by
# ImportParser._who_when().
Authorship = collections.namedtuple(
    'Authorship',
    ['name', 'email', 'timestamp', 'timezone'],
)