diff hgext3rd/fastimport/vendor/python_fastimport/parser.py @ 86:28704a2a7461 vendor/python-fastimport

Import python-fastimport-0.9.8
author Roy Marples <roy@marples.name>
date Tue, 19 Jan 2021 22:56:34 +0000
parents
children 2fc99e3479d9
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hgext3rd/fastimport/vendor/python_fastimport/parser.py	Tue Jan 19 22:56:34 2021 +0000
@@ -0,0 +1,659 @@
+# Copyright (C) 2008-2010 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+"""Parser of import data into command objects.
+
+In order to reuse existing front-ends, the stream format is a subset of
+the one used by git-fast-import (as of the 1.5.4 release of git at least).
+The grammar is:
+
+  stream ::= cmd*;
+
+  cmd ::= new_blob
+        | new_commit
+        | new_tag
+        | reset_branch
+        | checkpoint
+        | progress
+        ;
+
+  new_blob ::= 'blob' lf
+    mark?
+    file_content;
+  file_content ::= data;
+
+  new_commit ::= 'commit' sp ref_str lf
+    mark?
+    ('author' sp name '<' email '>' when lf)?
+    'committer' sp name '<' email '>' when lf
+    commit_msg
+    ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
+    ('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)*
+    file_change*
+    lf?;
+  commit_msg ::= data;
+
+  file_change ::= file_clr
+    | file_del
+    | file_rnm
+    | file_cpy
+    | file_obm
+    | file_inm;
+  file_clr ::= 'deleteall' lf;
+  file_del ::= 'D' sp path_str lf;
+  file_rnm ::= 'R' sp path_str sp path_str lf;
+  file_cpy ::= 'C' sp path_str sp path_str lf;
+  file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf;
+  file_inm ::= 'M' sp mode sp 'inline' sp path_str lf
+    data;
+
+  new_tag ::= 'tag' sp tag_str lf
+    'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf
+    'tagger' sp name '<' email '>' when lf
+    tag_msg;
+  tag_msg ::= data;
+
+  reset_branch ::= 'reset' sp ref_str lf
+    ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)?
+    lf?;
+
+  checkpoint ::= 'checkpoint' lf
+    lf?;
+
+  progress ::= 'progress' sp not_lf* lf
+    lf?;
+
+     # note: the first idnum in a stream should be 1 and subsequent
+     # idnums should not have gaps between values as this will cause
+     # the stream parser to reserve space for the gapped values.  An
+     # idnum can be updated in the future to a new object by issuing
+     # a new mark directive with the old idnum.
+     #
+  mark ::= 'mark' sp idnum lf;
+  data ::= (delimited_data | exact_data)
+    lf?;
+
+    # note: delim may be any string but must not contain lf.
+    # data_line may contain any data but must not be exactly
+    # delim. The lf after the final data_line is included in
+    # the data.
+  delimited_data ::= 'data' sp '<<' delim lf
+    (data_line lf)*
+    delim lf;
+
+     # note: declen indicates the length of binary_data in bytes.
+     # declen does not include the lf preceding the binary data.
+     #
+  exact_data ::= 'data' sp declen lf
+    binary_data;
+
+     # note: quoted strings are C-style quoting supporting \c for
+     # common escapes of 'c' (e.g. \n, \t, \\, \") or \nnn where nnn
+     # is the signed byte value in octal.  Note that the only
+     # characters which must actually be escaped to protect the
+     # stream formatting is: \, " and LF.  Otherwise these values
+     # are UTF8.
+     #
+  ref_str     ::= ref;
+  sha1exp_str ::= sha1exp;
+  tag_str     ::= tag;
+  path_str    ::= path    | '"' quoted(path)    '"' ;
+  mode        ::= '100644' | '644'
+                | '100755' | '755'
+                | '120000'
+                ;
+
+  declen ::= # unsigned 32 bit value, ascii base10 notation;
+  bigint ::= # unsigned integer value, ascii base10 notation;
+  binary_data ::= # file content, not interpreted;
+
+  when         ::= raw_when | rfc2822_when;
+  raw_when     ::= ts sp tz;
+  rfc2822_when ::= # Valid RFC 2822 date and time;
+
+  sp ::= # ASCII space character;
+  lf ::= # ASCII newline (LF) character;
+
+     # note: a colon (':') must precede the numerical value assigned to
+     # an idnum.  This is to distinguish it from a ref or tag name as
+     # GIT does not permit ':' in ref or tag strings.
+     #
+  idnum   ::= ':' bigint;
+  path    ::= # GIT style file path, e.g. "a/b/c";
+  ref     ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT";
+  tag     ::= # GIT tag name, e.g. "FIREFOX_1_5";
+  sha1exp ::= # Any valid GIT SHA1 expression;
+  hexsha1 ::= # SHA1 in hexadecimal format;
+
+     # note: name and email are UTF8 strings, however name must not
+     # contain '<' or lf and email must not contain any of the
+     # following: '<', '>', lf.
+     #
+  name  ::= # valid GIT author/committer name;
+  email ::= # valid GIT author/committer email;
+  ts    ::= # time since the epoch in seconds, ascii base10 notation;
+  tz    ::= # GIT style timezone;
+
+     # note: comments may appear anywhere in the input, except
+     # within a data command.  Any form of the data command
+     # always escapes the related input from comment processing.
+     #
+     # In case it is not clear, the '#' that starts the comment
+     # must be the first character on that line (an lf must have
+     # preceded it).
+     #
+  comment ::= '#' not_lf* lf;
+  not_lf  ::= # Any byte that is not ASCII newline (LF);
+"""
+from __future__ import print_function
+
+import collections
+import re
+import sys
+import codecs
+
+from fastimport import (
+    commands,
+    dates,
+    errors,
+    )
+from fastimport.helpers import (
+    newobject as object,
+    utf8_bytes_string,
+    )
+
+
+## Stream parsing ##
+
+class LineBasedParser(object):
+    """Reader over a binary stream that keeps track of the line number.
+
+    Lines can be pushed back with push_line(); they are then returned by
+    the next readline()/next_line() call.  read_bytes() and read_until()
+    read straight from the underlying stream and ignore pushed-back lines.
+    """
+
+    def __init__(self, input_stream):
+        """A Parser that keeps track of line numbers.
+
+        :param input_stream: the file-like object to read from
+        """
+        self.input = input_stream
+        self.lineno = 0
+        # Lines pushed back onto the input stream
+        self._buffer = []
+
+    def abort(self, exception, *args):
+        """Raise an exception providing line number information.
+
+        :param exception: the exception class to raise; it is called with
+          the current line number followed by args
+        """
+        raise exception(self.lineno, *args)
+
+    def readline(self):
+        """Get the next line including the newline or '' on EOF."""
+        self.lineno += 1
+        if self._buffer:
+            # Pushed-back lines take precedence over the stream
+            return self._buffer.pop()
+        return self.input.readline()
+
+    def next_line(self):
+        """Get the next line without the newline or None on EOF."""
+        line = self.readline()
+        if not line:
+            return None
+        # Only strip a real line terminator: the last line of a stream may
+        # end at EOF without one and must not lose its final byte.
+        if line.endswith(b'\n'):
+            return line[:-1]
+        return line
+
+    def push_line(self, line):
+        """Push line back onto the line buffer.
+
+        :param line: the line with no trailing newline
+        """
+        self.lineno -= 1
+        self._buffer.append(line + b'\n')
+
+    def read_bytes(self, count):
+        """Read a given number of bytes from the input stream.
+
+        Throws MissingBytes if the bytes are not found.
+
+        Note: This method does not read from the line buffer.
+
+        :param count: the number of bytes to read
+        :return: the bytes read
+        """
+        result = self.input.read(count)
+        found = len(result)
+        # Keep the line counter in step with the newlines just consumed
+        self.lineno += result.count(b'\n')
+        if found != count:
+            self.abort(errors.MissingBytes, count, found)
+        return result
+
+    def read_until(self, terminator):
+        """Read the input stream until the terminator is found.
+
+        Throws MissingTerminator if the terminator is not found.
+
+        Note: This method does not read from the line buffer.
+
+        :param terminator: the terminator line, without its newline
+        :return: the bytes read up to but excluding the terminator.
+        """
+        lines = []
+        term = terminator + b'\n'
+        while True:
+            line = self.input.readline()
+            if not line:
+                # EOF before the terminator: without this check the loop
+                # would never end, as readline() keeps returning b''.
+                self.abort(errors.MissingTerminator, terminator)
+            # Count every line consumed, the terminator line included
+            self.lineno += 1
+            if line == term:
+                break
+            lines.append(line)
+        return b''.join(lines)
+
+
+# Regular expressions used for parsing "who" and "when" information.
+# (Note: The spec states that the name part should be non-empty but
+# git-fast-export doesn't always do that, so the name group is [^<]*,
+# which also matches the empty string.) Also git-fast-import code says
+# the space before the email is optional.
+_WHO_AND_WHEN_RE = re.compile(br'([^<]*)<(.*)> (.+)')
+_WHO_RE = re.compile(br'([^<]*)<(.*)>')
+
+
+class ImportParser(LineBasedParser):
+    """Parser turning a fast-import stream into command objects.
+
+    Top-level commands are produced lazily by iter_commands(); the file
+    changes that belong to a commit are produced by iter_file_commands().
+    """
+
+    def __init__(self, input_stream, verbose=False, output=sys.stdout,
+        user_mapper=None, strict=True):
+        """A Parser of import commands.
+
+        :param input_stream: the file-like object to read from
+        :param verbose: display extra information or not
+        :param output: the file-like object to write messages to (YAGNI?)
+        :param user_mapper: if not None, the UserMapper used to adjust
+          user-ids for authors, committers and taggers.
+        :param strict: Raise errors on strictly invalid data
+        """
+        LineBasedParser.__init__(self, input_stream)
+        self.verbose = verbose
+        self.output = output
+        self.user_mapper = user_mapper
+        self.strict = strict
+        # We auto-detect the date format when a date is first encountered
+        self.date_parser = None
+        # Features seen in 'feature' commands: name -> value (or None)
+        self.features = {}
+
+    def warning(self, msg):
+        """Write a warning prefixed with the current line number to stderr."""
+        sys.stderr.write("warning line %d: %s\n" % (self.lineno, msg))
+
+    def iter_commands(self):
+        """Iterator returning ImportCommand objects.
+
+        Iteration ends at a 'done' command or at EOF.  Reaching EOF after
+        a 'done' feature was declared raises PrematureEndOfStream; an
+        unrecognised line raises InvalidCommand.
+        """
+        while True:
+            line = self.next_line()
+            if line is None:
+                if b'done' in self.features:
+                    raise errors.PrematureEndOfStream(self.lineno)
+                break
+            elif len(line) == 0 or line.startswith(b'#'):
+                # Blank lines and comments carry no command
+                continue
+            # Search for commands in order of likelihood
+            elif line.startswith(b'commit '):
+                yield self._parse_commit(line[len(b'commit '):])
+            elif line.startswith(b'blob'):
+                yield self._parse_blob()
+            elif line.startswith(b'done'):
+                break
+            elif line.startswith(b'progress '):
+                yield commands.ProgressCommand(line[len(b'progress '):])
+            elif line.startswith(b'reset '):
+                yield self._parse_reset(line[len(b'reset '):])
+            elif line.startswith(b'tag '):
+                yield self._parse_tag(line[len(b'tag '):])
+            elif line.startswith(b'checkpoint'):
+                yield commands.CheckpointCommand()
+            elif line.startswith(b'feature'):
+                yield self._parse_feature(line[len(b'feature '):])
+            else:
+                self.abort(errors.InvalidCommand, line)
+
+    def iter_file_commands(self):
+        """Iterator returning FileCommand objects.
+
+        If an invalid file command is found, the line is silently
+        pushed back and iteration ends.
+        """
+        while True:
+            line = self.next_line()
+            if line is None:
+                break
+            elif len(line) == 0 or line.startswith(b'#'):
+                continue
+            # Search for file commands in order of likelihood
+            elif line.startswith(b'M '):
+                yield self._parse_file_modify(line[2:])
+            elif line.startswith(b'D '):
+                path = self._path(line[2:])
+                yield commands.FileDeleteCommand(path)
+            elif line.startswith(b'R '):
+                old, new = self._path_pair(line[2:])
+                yield commands.FileRenameCommand(old, new)
+            elif line.startswith(b'C '):
+                src, dest = self._path_pair(line[2:])
+                yield commands.FileCopyCommand(src, dest)
+            elif line.startswith(b'deleteall'):
+                yield commands.FileDeleteAllCommand()
+            else:
+                # Not a file command: leave it for the command iterator
+                self.push_line(line)
+                break
+
+    def _parse_blob(self):
+        """Parse a blob command."""
+        lineno = self.lineno
+        mark = self._get_mark_if_any()
+        data = self._get_data(b'blob')
+        return commands.BlobCommand(mark, data, lineno)
+
+    def _parse_commit(self, ref):
+        """Parse a commit command.
+
+        :param ref: the text following 'commit ' on the command line
+        """
+        lineno = self.lineno
+        mark = self._get_mark_if_any()
+        author = self._get_user_info(b'commit', b'author', False)
+        # Any further 'author' lines are collected separately
+        more_authors = []
+        while True:
+            another_author = self._get_user_info(b'commit', b'author', False)
+            if another_author is not None:
+                more_authors.append(another_author)
+            else:
+                break
+        committer = self._get_user_info(b'commit', b'committer')
+        message = self._get_data(b'commit', b'message')
+        from_ = self._get_from()
+        merges = []
+        while True:
+            merge = self._get_merge()
+            if merge is not None:
+                # while the spec suggests it's illegal, git-fast-export
+                # outputs multiple merges on the one line, e.g.
+                # merge :x :y :z
+                these_merges = merge.split(b' ')
+                merges.extend(these_merges)
+            else:
+                break
+        properties = {}
+        while True:
+            name_value = self._get_property()
+            if name_value is not None:
+                name, value = name_value
+                properties[name] = value
+            else:
+                break
+        return commands.CommitCommand(ref, mark, author, committer, message,
+            from_, merges, list(self.iter_file_commands()), lineno=lineno,
+            more_authors=more_authors, properties=properties)
+
+    def _parse_feature(self, info):
+        """Parse a feature command.
+
+        :param info: the text following 'feature ', either 'name' or
+          'name=value'
+        """
+        parts = info.split(b'=', 1)
+        name = parts[0]
+        if len(parts) > 1:
+            value = self._path(parts[1])
+        else:
+            value = None
+        # Remembered so that iter_commands can check for b'done' at EOF
+        self.features[name] = value
+        return commands.FeatureCommand(name, value, lineno=self.lineno)
+
+    def _parse_file_modify(self, info):
+        """Parse a filemodify command within a commit.
+
+        :param info: a string in the format "mode dataref path"
+          (where dataref might be the hard-coded literal 'inline').
+        """
+        params = info.split(b' ', 2)
+        path = self._path(params[2])
+        mode = self._mode(params[0])
+        if params[1] == b'inline':
+            # Inline content follows as a data section
+            dataref = None
+            data = self._get_data(b'filemodify')
+        else:
+            dataref = params[1]
+            data = None
+        return commands.FileModifyCommand(path, mode, dataref,
+            data)
+
+    def _parse_reset(self, ref):
+        """Parse a reset command."""
+        from_ = self._get_from()
+        return commands.ResetCommand(ref, from_)
+
+    def _parse_tag(self, name):
+        """Parse a tag command."""
+        from_ = self._get_from(b'tag')
+        tagger = self._get_user_info(b'tag', b'tagger',
+                accept_just_who=True)
+        message = self._get_data(b'tag', b'message')
+        return commands.TagCommand(name, from_, tagger, message)
+
+    def _get_mark_if_any(self):
+        """Parse a mark section.
+
+        :return: the text following 'mark :', or None if the next line is
+          not a mark line (it is then pushed back) or the stream has ended
+        """
+        line = self.next_line()
+        if line is None:
+            # EOF: nothing to inspect and nothing to push back
+            return None
+        if line.startswith(b'mark :'):
+            return line[len(b'mark :'):]
+        self.push_line(line)
+        return None
+
+    def _get_from(self, required_for=None):
+        """Parse a from section.
+
+        :param required_for: if set, the name of the command that requires
+          this section; MissingSection is then raised when it is absent,
+          including when the stream has ended
+        :return: the text following 'from ', or None if the section is
+          absent and not required
+        """
+        line = self.next_line()
+        if line is not None and line.startswith(b'from '):
+            return line[len(b'from '):]
+        if required_for:
+            self.abort(errors.MissingSection, required_for, 'from')
+        if line is not None:
+            self.push_line(line)
+        return None
+
+    def _get_merge(self):
+        """Parse a merge section."""
+        line = self.next_line()
+        if line is None:
+            return None
+        elif line.startswith(b'merge '):
+            return line[len(b'merge '):]
+        else:
+            self.push_line(line)
+            return None
+
+    def _get_property(self):
+        """Parse a property section."""
+        line = self.next_line()
+        if line is None:
+            return None
+        elif line.startswith(b'property '):
+            return self._name_value(line[len(b'property '):])
+        else:
+            self.push_line(line)
+            return None
+
+    def _get_user_info(self, cmd, section, required=True,
+        accept_just_who=False):
+        """Parse a user section.
+
+        :param cmd: the name of the command being parsed (for errors)
+        :param section: the section keyword, e.g. b'author'
+        :param required: raise MissingSection if the section is absent
+        :param accept_just_who: passed on to _who_when
+        :return: the result of _who_when, or None if the section is absent
+          and not required
+        """
+        line = self.next_line()
+        if line is not None and line.startswith(section + b' '):
+            return self._who_when(line[len(section + b' '):], cmd, section,
+                accept_just_who=accept_just_who)
+        if required:
+            self.abort(errors.MissingSection, cmd, section)
+        # At EOF there is no line to hand back
+        if line is not None:
+            self.push_line(line)
+        return None
+
+    def _get_data(self, required_for, section=b'data'):
+        """Parse a data section.
+
+        Raises MissingSection if the next line is not a data line or the
+        stream has ended.
+
+        :param required_for: the name of the command needing the data
+        :param section: the section name reported in the error
+        :return: the data read
+        """
+        line = self.next_line()
+        if line is None or not line.startswith(b'data '):
+            self.abort(errors.MissingSection, required_for, section)
+        rest = line[len(b'data '):]
+        if rest.startswith(b'<<'):
+            # Delimited form: read up to the delimiter line
+            return self.read_until(rest[2:])
+        size = int(rest)
+        read_bytes = self.read_bytes(size)
+        # optional LF after data.
+        next_line = self.input.readline()
+        if next_line:
+            self.lineno += 1
+            if next_line != b'\n':
+                # Not the optional LF: hand the line back, stripping only
+                # a real terminator so an unterminated last line is kept
+                # intact.  push_line undoes the line count above.
+                if next_line.endswith(b'\n'):
+                    next_line = next_line[:-1]
+                self.push_line(next_line)
+        # At EOF (empty read) there is nothing to count or push back
+        return read_bytes
+
+    def _who_when(self, s, cmd, section, accept_just_who=False):
+        """Parse who and when information from a string.
+
+        :param s: the text following the section keyword
+        :param cmd: the name of the command being parsed (for errors)
+        :param section: the section being parsed (for errors)
+        :param accept_just_who: accept a string without a date, using the
+          'now' date parser for the missing time
+        :return: an Authorship of (name,email,timestamp,timezone). name
+            may be the empty string if only an email address was given.
+        """
+        match = _WHO_AND_WHEN_RE.search(s)
+        if match:
+            datestr = match.group(3).lstrip()
+            if self.date_parser is None:
+                # auto-detect the date format
+                if len(datestr.split(b' ')) == 2:
+                    date_format = 'raw'
+                elif datestr == b'now':
+                    date_format = 'now'
+                else:
+                    date_format = 'rfc2822'
+                self.date_parser = dates.DATE_PARSERS_BY_NAME[date_format]
+            try:
+                when = self.date_parser(datestr, self.lineno)
+            except ValueError:
+                print("failed to parse datestr '%s'" % (datestr,))
+                raise
+            name = match.group(1).rstrip()
+            email = match.group(2)
+        else:
+            match = _WHO_RE.search(s)
+            if accept_just_who and match:
+                # HACK around missing time
+                # TODO: output a warning here
+                when = dates.DATE_PARSERS_BY_NAME['now']('now')
+                name = match.group(1)
+                email = match.group(2)
+            elif self.strict:
+                self.abort(errors.BadFormat, cmd, section, s)
+            else:
+                # Lenient mode: keep the whole string as the name
+                name = s
+                email = None
+                when = dates.DATE_PARSERS_BY_NAME['now']('now')
+        if len(name) > 0:
+            if name.endswith(b' '):
+                name = name[:-1]
+        # While it shouldn't happen, some datasets have email addresses
+        # which contain unicode characters. See bug 338186. We sanitize
+        # the data at this level just in case.
+        if self.user_mapper:
+            name, email = self.user_mapper.map_name_and_email(name, email)
+
+        return Authorship(name, email, when[0], when[1])
+
+    def _name_value(self, s):
+        """Parse a (name,value) tuple from 'name value-length value'.
+
+        A value longer than the rest of the line continues on the
+        following lines; the missing bytes are read from the stream.
+        """
+        parts = s.split(b' ', 2)
+        name = parts[0]
+        if len(parts) == 1:
+            value = None
+        else:
+            size = int(parts[1])
+            # 'name size' with no value text means an empty value so far
+            value = parts[2] if len(parts) > 2 else b''
+            still_to_read = size - len(value)
+            if still_to_read > 0:
+                # The newline that ended the line belongs to the value;
+                # the last byte read is the newline ending the section.
+                read_bytes = self.read_bytes(still_to_read)
+                value += b'\n' + read_bytes[:still_to_read - 1]
+        return (name, value)
+
+    def _path(self, s):
+        """Parse a path, unquoting it if it is enclosed in double quotes."""
+        if s.startswith(b'"'):
+            if not s.endswith(b'"'):
+                self.abort(errors.BadFormat, '?', '?', s)
+            else:
+                return _unquote_c_string(s[1:-1])
+        return s
+
+    def _path_pair(self, s):
+        """Parse two paths separated by a space.
+
+        :return: a list of the two unquoted paths
+        """
+        # TODO: handle a space in the first path
+        if s.startswith(b'"'):
+            parts = s[1:].split(b'" ', 1)
+        else:
+            parts = s.split(b' ', 1)
+        if len(parts) != 2:
+            self.abort(errors.BadFormat, '?', '?', s)
+        elif parts[1].startswith(b'"') and parts[1].endswith(b'"'):
+            parts[1] = parts[1][1:-1]
+        elif parts[1].startswith(b'"') or parts[1].endswith(b'"'):
+            # A quote on one side only is malformed
+            self.abort(errors.BadFormat, '?', '?', s)
+        return [_unquote_c_string(part) for part in parts]
+
+    def _mode(self, s):
+        """Check file mode format and parse into an int.
+
+        :return: mode as integer
+        """
+        # Note: Output from git-fast-export slightly different to spec
+        if s in [b'644', b'100644', b'0100644']:
+            return 0o100644
+        elif s in [b'755', b'100755', b'0100755']:
+            return 0o100755
+        elif s in [b'040000', b'0040000']:
+            return 0o40000
+        elif s in [b'120000', b'0120000']:
+            return 0o120000
+        elif s in [b'160000', b'0160000']:
+            return 0o160000
+        else:
+            self.abort(errors.BadFormat, 'filemodify', 'mode', s)
+
+
+# Patterns matching a single backslash escape sequence, used by
+# _unquote_c_string.  The two patterns list the same alternatives: the
+# first is compiled from a bytes pattern, the second from a text pattern.
+ESCAPE_SEQUENCE_BYTES_RE = re.compile(br'''
+    ( \\U........      # 8-digit hex escapes
+    | \\u....          # 4-digit hex escapes
+    | \\x..            # 2-digit hex escapes
+    | \\[0-7]{1,3}     # Octal escapes
+    | \\N\{[^}]+\}     # Unicode characters by name
+    | \\[\\'"abfnrtv]  # Single-character escapes
+    )''', re.VERBOSE
+)
+
+ESCAPE_SEQUENCE_RE = re.compile(r'''
+    ( \\U........
+    | \\u....
+    | \\x..
+    | \\[0-7]{1,3}
+    | \\N\{[^}]+\}
+    | \\[\\'"abfnrtv]
+    )''', re.UNICODE | re.VERBOSE
+)
+
+def _unquote_c_string(s):
+    """Expand the C-style escape sequences found in s.
+
+    Each escape sequence is decoded on its own with the 'unicode-escape'
+    codec and the result passed through utf8_bytes_string; everything
+    else in s is left untouched.
+
+    :param s: the string to unquote, without its enclosing quotes
+    :return: s with every matched escape sequence replaced
+    """
+    def _expand(found):
+        decoded = codecs.decode(found.group(0), 'unicode-escape')
+        return utf8_bytes_string(decoded)
+
+    # Decoding the whole string with 'unicode_escape' can return an
+    # incorrect output with unicode strings (both in py2 and py3), so only
+    # the matched escape sequences are decoded, one at a time.
+    is_py3_bytes = sys.version_info[0] >= 3 and isinstance(s, bytes)
+    if is_py3_bytes:
+        pattern = ESCAPE_SEQUENCE_BYTES_RE
+    else:
+        pattern = ESCAPE_SEQUENCE_RE
+    return pattern.sub(_expand, s)
+
+
+# Who made a change and when, as returned by ImportParser._who_when.
+Authorship = collections.namedtuple(
+    'Authorship', ['name', 'email', 'timestamp', 'timezone'])