Mercurial > hg > hg-fastimport
diff hgext3rd/fastimport/vendor/python_fastimport/parser.py @ 86:28704a2a7461 vendor/python-fastimport
Import python-fastimport-0.9.8
| author | Roy Marples <roy@marples.name> |
|---|---|
| date | Tue, 19 Jan 2021 22:56:34 +0000 |
| parents | |
| children | 2fc99e3479d9 |
line wrap: on
line diff
# Copyright (C) 2008-2010 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""Parser of import data into command objects.

In order to reuse existing front-ends, the stream format is a subset of
the one used by git-fast-import (as of the 1.5.4 release of git at least).
The grammar is:

  stream ::= cmd*;

  cmd ::= new_blob
        | new_commit
        | new_tag
        | reset_branch
        | checkpoint
        | progress
        ;

  new_blob ::= 'blob' lf
    mark?
    file_content;
  file_content ::= data;

  new_commit ::= 'commit' sp ref_str lf
    mark?
    ('author' sp name '<' email '>' when lf)?
    'committer' sp name '<' email '>' when lf
    commit_msg
    ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
    ('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)*
    file_change*
    lf?;
  commit_msg ::= data;

  file_change ::= file_clr
    | file_del
    | file_rnm
    | file_cpy
    | file_obm
    | file_inm;
  file_clr ::= 'deleteall' lf;
  file_del ::= 'D' sp path_str lf;
  file_rnm ::= 'R' sp path_str sp path_str lf;
  file_cpy ::= 'C' sp path_str sp path_str lf;
  file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf;
  file_inm ::= 'M' sp mode sp 'inline' sp path_str lf
    data;

  new_tag ::= 'tag' sp tag_str lf
    'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf
    'tagger' sp name '<' email '>' when lf
    tag_msg;
  tag_msg ::= data;

  reset_branch ::= 'reset' sp ref_str lf
    ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
    lf?;

  checkpoint ::= 'checkpoint' lf
    lf?;

  progress ::= 'progress' sp not_lf* lf
    lf?;

     # note: the first idnum in a stream should be 1 and subsequent
     # idnums should not have gaps between values as this will cause
     # the stream parser to reserve space for the gapped values.  An
     # idnum can be updated in the future to a new object by issuing
     # a new mark directive with the old idnum.
     #
  mark ::= 'mark' sp idnum lf;
  data ::= (delimited_data | exact_data)
    lf?;

    # note: delim may be any string but must not contain lf.
    # data_line may contain any data but must not be exactly
    # delim. The lf after the final data_line is included in
    # the data.
  delimited_data ::= 'data' sp '<<' delim lf
    (data_line lf)*
    delim lf;

     # note: declen indicates the length of binary_data in bytes.
     # declen does not include the lf preceding the binary data.
     #
  exact_data ::= 'data' sp declen lf
    binary_data;

     # note: quoted strings are C-style quoting supporting \c for
     # common escapes of 'c' (e.g. \n, \t, \\, \") or \nnn where nnn
     # is the signed byte value in octal.  Note that the only
     # characters which must actually be escaped to protect the
     # stream formatting is: \, " and LF.  Otherwise these values
     # are UTF8.
     #
  ref_str     ::= ref;
  sha1exp_str ::= sha1exp;
  tag_str     ::= tag;
  path_str    ::= path    | '"' quoted(path)    '"' ;
  mode        ::= '100644' | '644'
                | '100755' | '755'
                | '120000'
                ;

  declen ::= # unsigned 32 bit value, ascii base10 notation;
  bigint ::= # unsigned integer value, ascii base10 notation;
  binary_data ::= # file content, not interpreted;

  when         ::= raw_when | rfc2822_when;
  raw_when     ::= ts sp tz;
  rfc2822_when ::= # Valid RFC 2822 date and time;

  sp ::= # ASCII space character;
  lf ::= # ASCII newline (LF) character;

     # note: a colon (':') must precede the numerical value assigned to
     # an idnum.  This is to distinguish it from a ref or tag name as
     # GIT does not permit ':' in ref or tag strings.
     #
  idnum   ::= ':' bigint;
  path    ::= # GIT style file path, e.g. "a/b/c";
  ref     ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT";
  tag     ::= # GIT tag name, e.g. "FIREFOX_1_5";
  sha1exp ::= # Any valid GIT SHA1 expression;
  hexsha1 ::= # SHA1 in hexadecimal format;

     # note: name and email are UTF8 strings, however name must not
     # contain '<' or lf and email must not contain any of the
     # following: '<', '>', lf.
     #
  name  ::= # valid GIT author/committer name;
  email ::= # valid GIT author/committer email;
  ts    ::= # time since the epoch in seconds, ascii base10 notation;
  tz    ::= # GIT style timezone;

     # note: comments may appear anywhere in the input, except
     # within a data command.  Any form of the data command
     # always escapes the related input from comment processing.
     #
     # In case it is not clear, the '#' that starts the comment
     # must be the first character on that line (an lf must have
     # preceded it).
     #
  comment ::= '#' not_lf* lf;
  not_lf  ::= # Any byte that is not ASCII newline (LF);
"""
from __future__ import print_function

import collections
import re
import sys
import codecs

from fastimport import (
    commands,
    dates,
    errors,
    )
from fastimport.helpers import (
    newobject as object,
    utf8_bytes_string,
    )


## Stream parsing ##

class LineBasedParser(object):

    def __init__(self, input_stream):
        """A Parser that keeps track of line numbers.

        :param input_stream: the file-like object to read from
        """
        self.input = input_stream
        self.lineno = 0
        # Lines pushed back onto the input stream
        self._buffer = []

    def abort(self, exception, *args):
        """Raise an exception providing line number information."""
        raise exception(self.lineno, *args)

    def readline(self):
        """Get the next line including the newline or '' on EOF."""
        self.lineno += 1
        if self._buffer:
            return self._buffer.pop()
        else:
            return self.input.readline()

    def next_line(self):
        """Get the next line without the newline or None on EOF."""
        line = self.readline()
        if line:
            return line[:-1]
        else:
            return None

    def push_line(self, line):
        """Push line back onto the line buffer.

        :param line: the line with no trailing newline
        """
        self.lineno -= 1
        self._buffer.append(line + b'\n')

    def read_bytes(self, count):
        """Read a given number of bytes from the input stream.

        Throws MissingBytes if the bytes are not found.

        Note: This method does not read from the line buffer.

        :return: a string
        """
        result = self.input.read(count)
        found = len(result)
        self.lineno += result.count(b'\n')
        if found != count:
            self.abort(errors.MissingBytes, count, found)
        return result

    def read_until(self, terminator):
        """Read the input stream until the terminator is found.

        Throws MissingTerminator if the terminator is not found.

        Note: This method does not read from the line buffer.

        :return: the bytes read up to but excluding the terminator.
        """
        lines = []
        term = terminator + b'\n'
        while True:
            line = self.input.readline()
            if line == term:
                break
            else:
                lines.append(line)
        return b''.join(lines)


# Regular expression used for parsing. (Note: The spec states that the name
# part should be non-empty but git-fast-export doesn't always do that so
# the first bit is \w*, not \w+.) Also git-fast-import code says the
# space before the email is optional.
_WHO_AND_WHEN_RE = re.compile(br'([^<]*)<(.*)> (.+)')
_WHO_RE = re.compile(br'([^<]*)<(.*)>')


class ImportParser(LineBasedParser):

    def __init__(self, input_stream, verbose=False, output=sys.stdout,
                 user_mapper=None, strict=True):
        """A Parser of import commands.

        :param input_stream: the file-like object to read from
        :param verbose: display extra information or not
        :param output: the file-like object to write messages to (YAGNI?)
        :param user_mapper: if not None, the UserMapper used to adjust
          user-ids for authors, committers and taggers.
        :param strict: Raise errors on strictly invalid data
        """
        LineBasedParser.__init__(self, input_stream)
        self.verbose = verbose
        self.output = output
        self.user_mapper = user_mapper
        self.strict = strict
        # We auto-detect the date format when a date is first encountered
        self.date_parser = None
        self.features = {}

    def warning(self, msg):
        """Write a line-number-prefixed warning to stderr."""
        sys.stderr.write("warning line %d: %s\n" % (self.lineno, msg))

    def iter_commands(self):
        """Iterator returning ImportCommand objects."""
        while True:
            line = self.next_line()
            if line is None:
                if b'done' in self.features:
                    raise errors.PrematureEndOfStream(self.lineno)
                break
            elif len(line) == 0 or line.startswith(b'#'):
                continue
            # Search for commands in order of likelihood
            elif line.startswith(b'commit '):
                yield self._parse_commit(line[len(b'commit '):])
            elif line.startswith(b'blob'):
                yield self._parse_blob()
            elif line.startswith(b'done'):
                break
            elif line.startswith(b'progress '):
                yield commands.ProgressCommand(line[len(b'progress '):])
            elif line.startswith(b'reset '):
                yield self._parse_reset(line[len(b'reset '):])
            elif line.startswith(b'tag '):
                yield self._parse_tag(line[len(b'tag '):])
            elif line.startswith(b'checkpoint'):
                yield commands.CheckpointCommand()
            elif line.startswith(b'feature'):
                yield self._parse_feature(line[len(b'feature '):])
            else:
                self.abort(errors.InvalidCommand, line)

    def iter_file_commands(self):
        """Iterator returning FileCommand objects.

        If an invalid file command is found, the line is silently
        pushed back and iteration ends.
        """
        while True:
            line = self.next_line()
            if line is None:
                break
            elif len(line) == 0 or line.startswith(b'#'):
                continue
            # Search for file commands in order of likelihood
            elif line.startswith(b'M '):
                yield self._parse_file_modify(line[2:])
            elif line.startswith(b'D '):
                path = self._path(line[2:])
                yield commands.FileDeleteCommand(path)
            elif line.startswith(b'R '):
                old, new = self._path_pair(line[2:])
                yield commands.FileRenameCommand(old, new)
            elif line.startswith(b'C '):
                src, dest = self._path_pair(line[2:])
                yield commands.FileCopyCommand(src, dest)
            elif line.startswith(b'deleteall'):
                yield commands.FileDeleteAllCommand()
            else:
                self.push_line(line)
                break

    def _parse_blob(self):
        """Parse a blob command."""
        lineno = self.lineno
        mark = self._get_mark_if_any()
        data = self._get_data(b'blob')
        return commands.BlobCommand(mark, data, lineno)

    def _parse_commit(self, ref):
        """Parse a commit command."""
        lineno = self.lineno
        mark = self._get_mark_if_any()
        author = self._get_user_info(b'commit', b'author', False)
        more_authors = []
        while True:
            another_author = self._get_user_info(b'commit', b'author', False)
            if another_author is not None:
                more_authors.append(another_author)
            else:
                break
        committer = self._get_user_info(b'commit', b'committer')
        message = self._get_data(b'commit', b'message')
        from_ = self._get_from()
        merges = []
        while True:
            merge = self._get_merge()
            if merge is not None:
                # while the spec suggests it's illegal, git-fast-export
                # outputs multiple merges on the one line, e.g.
                # merge :x :y :z
                these_merges = merge.split(b' ')
                merges.extend(these_merges)
            else:
                break
        properties = {}
        while True:
            name_value = self._get_property()
            if name_value is not None:
                name, value = name_value
                properties[name] = value
            else:
                break
        return commands.CommitCommand(ref, mark, author, committer, message,
            from_, merges, list(self.iter_file_commands()), lineno=lineno,
            more_authors=more_authors, properties=properties)

    def _parse_feature(self, info):
        """Parse a feature command."""
        parts = info.split(b'=', 1)
        name = parts[0]
        if len(parts) > 1:
            value = self._path(parts[1])
        else:
            value = None
        self.features[name] = value
        return commands.FeatureCommand(name, value, lineno=self.lineno)

    def _parse_file_modify(self, info):
        """Parse a filemodify command within a commit.

        :param info: a string in the format "mode dataref path"
          (where dataref might be the hard-coded literal 'inline').
        """
        params = info.split(b' ', 2)
        path = self._path(params[2])
        mode = self._mode(params[0])
        if params[1] == b'inline':
            dataref = None
            data = self._get_data(b'filemodify')
        else:
            dataref = params[1]
            data = None
        return commands.FileModifyCommand(path, mode, dataref,
            data)

    def _parse_reset(self, ref):
        """Parse a reset command."""
        from_ = self._get_from()
        return commands.ResetCommand(ref, from_)

    def _parse_tag(self, name):
        """Parse a tag command."""
        from_ = self._get_from(b'tag')
        tagger = self._get_user_info(b'tag', b'tagger',
            accept_just_who=True)
        message = self._get_data(b'tag', b'message')
        return commands.TagCommand(name, from_, tagger, message)

    def _get_mark_if_any(self):
        """Parse a mark section."""
        line = self.next_line()
        # Guard against EOF: next_line() returns None on a truncated
        # stream, and None has no startswith().
        if line is None:
            return None
        elif line.startswith(b'mark :'):
            return line[len(b'mark :'):]
        else:
            self.push_line(line)
            return None

    def _get_from(self, required_for=None):
        """Parse a from section."""
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith(b'from '):
            return line[len(b'from '):]
        elif required_for:
            self.abort(errors.MissingSection, required_for, 'from')
        else:
            self.push_line(line)
            return None

    def _get_merge(self):
        """Parse a merge section."""
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith(b'merge '):
            return line[len(b'merge '):]
        else:
            self.push_line(line)
            return None

    def _get_property(self):
        """Parse a property section."""
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith(b'property '):
            return self._name_value(line[len(b'property '):])
        else:
            self.push_line(line)
            return None

    def _get_user_info(self, cmd, section, required=True,
        accept_just_who=False):
        """Parse a user section."""
        line = self.next_line()
        # Guard against EOF (None) before calling startswith(); a
        # truncated stream should report a missing section, not crash.
        if line is None:
            if required:
                self.abort(errors.MissingSection, cmd, section)
            return None
        elif line.startswith(section + b' '):
            return self._who_when(line[len(section + b' '):], cmd, section,
                accept_just_who=accept_just_who)
        elif required:
            self.abort(errors.MissingSection, cmd, section)
        else:
            self.push_line(line)
            return None

    def _get_data(self, required_for, section=b'data'):
        """Parse a data section."""
        line = self.next_line()
        # Guard against EOF (None) before calling startswith().
        if line is None:
            self.abort(errors.MissingSection, required_for, section)
        elif line.startswith(b'data '):
            rest = line[len(b'data '):]
            if rest.startswith(b'<<'):
                return self.read_until(rest[2:])
            else:
                size = int(rest)
                read_bytes = self.read_bytes(size)
                # Optional LF after data: consume it if present,
                # otherwise push the line back for the next reader.
                next_line = self.input.readline()
                self.lineno += 1
                if next_line == b'\n':
                    pass
                elif next_line == b'':
                    # EOF right after the data: nothing was read, so
                    # don't push a phantom blank line back.
                    self.lineno -= 1
                else:
                    # Only strip a newline that is actually there, so a
                    # final line lacking a trailing LF keeps its last byte.
                    if next_line.endswith(b'\n'):
                        next_line = next_line[:-1]
                    self.push_line(next_line)
                return read_bytes
        else:
            self.abort(errors.MissingSection, required_for, section)

    def _who_when(self, s, cmd, section, accept_just_who=False):
        """Parse who and when information from a string.

        :return: a tuple of (name,email,timestamp,timezone). name may be
            the empty string if only an email address was given.
        """
        match = _WHO_AND_WHEN_RE.search(s)
        if match:
            datestr = match.group(3).lstrip()
            if self.date_parser is None:
                # auto-detect the date format
                if len(datestr.split(b' ')) == 2:
                    date_format = 'raw'
                elif datestr == b'now':
                    date_format = 'now'
                else:
                    date_format = 'rfc2822'
                self.date_parser = dates.DATE_PARSERS_BY_NAME[date_format]
            try:
                when = self.date_parser(datestr, self.lineno)
            except ValueError:
                print("failed to parse datestr '%s'" % (datestr,))
                raise
            name = match.group(1).rstrip()
            email = match.group(2)
        else:
            match = _WHO_RE.search(s)
            if accept_just_who and match:
                # HACK around missing time
                # TODO: output a warning here
                when = dates.DATE_PARSERS_BY_NAME['now']('now')
                name = match.group(1)
                email = match.group(2)
            elif self.strict:
                self.abort(errors.BadFormat, cmd, section, s)
            else:
                name = s
                email = None
                when = dates.DATE_PARSERS_BY_NAME['now']('now')
        if len(name) > 0:
            if name.endswith(b' '):
                name = name[:-1]
        # While it shouldn't happen, some datasets have email addresses
        # which contain unicode characters. See bug 338186. We sanitize
        # the data at this level just in case.
        if self.user_mapper:
            name, email = self.user_mapper.map_name_and_email(name, email)

        return Authorship(name, email, when[0], when[1])

    def _name_value(self, s):
        """Parse a (name,value) tuple from 'name value-length value'."""
        parts = s.split(b' ', 2)
        name = parts[0]
        if len(parts) == 1:
            value = None
        else:
            size = int(parts[1])
            value = parts[2]
            still_to_read = size - len(value)
            if still_to_read > 0:
                # NOTE(review): the trailing newline of the first line is
                # counted as part of the value, hence the b'\n' re-insert
                # and the one-byte trim — confirm against emitters.
                read_bytes = self.read_bytes(still_to_read)
                value += b'\n' + read_bytes[:still_to_read - 1]
        return (name, value)

    def _path(self, s):
        """Parse a path."""
        if s.startswith(b'"'):
            if not s.endswith(b'"'):
                self.abort(errors.BadFormat, '?', '?', s)
            else:
                return _unquote_c_string(s[1:-1])
        return s

    def _path_pair(self, s):
        """Parse two paths separated by a space."""
        # TODO: handle a space in the first path
        if s.startswith(b'"'):
            parts = s[1:].split(b'" ', 1)
        else:
            parts = s.split(b' ', 1)
        if len(parts) != 2:
            self.abort(errors.BadFormat, '?', '?', s)
        elif parts[1].startswith(b'"') and parts[1].endswith(b'"'):
            parts[1] = parts[1][1:-1]
        elif parts[1].startswith(b'"') or parts[1].endswith(b'"'):
            self.abort(errors.BadFormat, '?', '?', s)
        return [_unquote_c_string(s) for s in parts]

    def _mode(self, s):
        """Check file mode format and parse into an int.

        :return: mode as integer
        """
        # Note: Output from git-fast-export slightly different to spec
        if s in [b'644', b'100644', b'0100644']:
            return 0o100644
        elif s in [b'755', b'100755', b'0100755']:
            return 0o100755
        elif s in [b'040000', b'0040000']:
            return 0o40000
        elif s in [b'120000', b'0120000']:
            return 0o120000
        elif s in [b'160000', b'0160000']:
            return 0o160000
        else:
            self.abort(errors.BadFormat, 'filemodify', 'mode', s)


ESCAPE_SEQUENCE_BYTES_RE = re.compile(br'''
    ( \\U........      # 8-digit hex escapes
    | \\u....          # 4-digit hex escapes
    | \\x..            # 2-digit hex escapes
    | \\[0-7]{1,3}     # Octal escapes
    | \\N\{[^}]+\}     # Unicode characters by name
    | \\[\\'"abfnrtv]  # Single-character escapes
    )''', re.VERBOSE
)

ESCAPE_SEQUENCE_RE = re.compile(r'''
    ( \\U........
    | \\u....
    | \\x..
    | \\[0-7]{1,3}
    | \\N\{[^}]+\}
    | \\[\\'"abfnrtv]
    )''', re.UNICODE | re.VERBOSE
)


def _unquote_c_string(s):
    r"""replace C-style escape sequences (\n, \", etc.) with real chars."""

    # doing a s.encode('utf-8').decode('unicode_escape') can return an
    # incorrect output with unicode string (both in py2 and py3) the safest way
    # is to match the escape sequences and decoding them alone.
    def decode_match(match):
        return utf8_bytes_string(
            codecs.decode(match.group(0), 'unicode-escape')
        )

    if sys.version_info[0] >= 3 and isinstance(s, bytes):
        return ESCAPE_SEQUENCE_BYTES_RE.sub(decode_match, s)
    else:
        return ESCAPE_SEQUENCE_RE.sub(decode_match, s)


Authorship = collections.namedtuple('Authorship', 'name email timestamp timezone')
