diff hgext3rd/fastimport/hgimport.py @ 79:59a9e4d0aa72

Move hgfastimport directory to hgext3rd/fastimport This will allow hgrc to enable this extension like so: [extensions] fastimport =
author Roy Marples <roy@marples.name>
date Mon, 18 Jan 2021 23:04:05 +0000
parents hgfastimport/hgimport.py@a4f13dc5e3f7
children e6602cc471d5
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hgext3rd/fastimport/hgimport.py	Mon Jan 18 23:04:05 2021 +0000
@@ -0,0 +1,437 @@
+# Copyright (C) 2008 Canonical Ltd
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+"""Processor of import commands.
+
+This module provides core processing functionality including an abstract class
+for basing real processors on. See the processors package for examples.
+"""
+
+import os
+import shutil
+import stat
+import sys
+
+from hgext.convert import common, hg as converthg
+from mercurial import util
+from mercurial.i18n import _
+
+from fastimport import processor, parser
+
+
+class fastimport_source(common.converter_source):
+    """Interface between the fastimport processor below and Mercurial's
+    normal conversion infrastructure.
+    """
+    def __init__(self, ui, repotype, repo, sources):
+        self.ui = ui
+        self.sources = sources
+        self.processor = HgImportProcessor(ui, repo)
+        self.parsed = False
+        self.repotype = repotype
+
+    # converter_source methods
+
+    def before(self):
+        self.processor.setup()
+
+    def after(self):
+        self.processor.teardown()
+
+    def getheads(self):
+        """Return a list of this repository's heads"""
+        self._parse()
+        allheads = []
+        for branchheads in self.processor.branchmap.values():
+            allheads.extend(branchheads)
+        return allheads
+
+    def getfile(self, name, fileid):
+        if fileid is None:              # deleted file
+            return None, None
+        return (self.processor.getblob(fileid),
+                self.processor.getmode(name, fileid))
+
+    def getchanges(self, commitid, full):
+        """Returns a tuple of (files, copies, cleanp2).
+
+        files is a sorted list of (filename, id) tuples for all files
+        changed between commitid and its first parent returned by
+        getcommit().
+        commitid id is the source revision id of the file.
+        cleanp2 is currently unused and an empty set is returned.
+
+        copies is a dictionary of dest: source
+        """
+        if full:
+            raise util.Abort(_("convert from fastimport does not support --full"))
+        return (self.processor.modified[commitid],
+                self.processor.copies[commitid],
+                set())
+
+    def getcommit(self, commitid):
+        """Return the commit object for commitid"""
+        if commitid is None:
+            return None
+        else:
+            return self.processor.commitmap[commitid]
+
+    def gettags(self):
+        """Return the tags as a dictionary of name: revision"""
+        # oops, this loses order
+        return dict(self.processor.tags)
+    
+    def getchangedfiles(self, rev, i):
+        """Return the files changed by rev compared to parent[i].
+
+        i is an index selecting one of the parents of rev.  The return
+        value should be the list of files that are different in rev and
+        this parent.
+
+        If rev has no parents, i is None.
+
+        This function is only needed to support --filemap
+        """
+        raise NotImplementedError()
+
+    # private worker methods
+
+    def _parse(self):
+        if self.parsed:
+            return
+        for source in self.sources:
+            if source == b"-":
+                infile = sys.stdin
+            else:
+                infile = open(source, 'rb')
+            try:
+                p = parser.ImportParser(infile)
+                self.processor.process(p.iter_commands)
+            finally:
+                if infile is not sys.stdin:
+                    infile.close()
+        self.parsed = True
+
+
+class HgImportProcessor(processor.ImportProcessor):
+    
+    tagprefix = b"refs/tags/"
+
+    def __init__(self, ui, repo):
+        super(HgImportProcessor, self).__init__()
+        self.ui = ui
+        self.repo = repo
+
+        self.commitmap = {}             # map commit ID (":1") to commit object
+        self.branchmap = {}             # map branch name to list of heads
+
+        # see HgImportCommitHandler for details on these three
+        self.modified = {}              # map commit id to list of file mods
+        self.filemodes = {}             # map commit id to {filename: mode} map
+        self.copies = {}                # map commit id to dict of file copies
+
+        self.tags = []                  # list of (tag, mark) tuples
+
+        self.numblobs = 0               # for progress reporting
+        self.blobdir = None
+
+    def setup(self):
+        """Setup before processing any streams."""
+        pass
+
+    def teardown(self):
+        """Cleanup after processing all streams."""
+        if self.blobdir and os.path.exists(self.blobdir):
+            self.ui.debug(b"Removing blob dir %s ...\n" % self.blobdir)
+            shutil.rmtree(self.blobdir)
+
+    def progress_handler(self, cmd):
+        self.ui.write(b"Progress: %s\n" % cmd.message)
+
+    def blob_handler(self, cmd):
+        self.writeblob(cmd.id, cmd.data)
+
+    def _getblobfilename(self, blobid):
+        if self.blobdir is None:
+            raise RuntimeError("no blobs seen, so no blob directory created")
+        # XXX should escape ":" for windows
+        return os.path.join(self.blobdir, b"blob-" + blobid)
+
+    def getblob(self, fileid):
+        (commitid, blobid) = fileid
+        f = open(self._getblobfilename(blobid), "rb")
+        try:
+            return f.read()
+        finally:
+            f.close()
+
+    def writeblob(self, blobid, data):
+        if self.blobdir is None:        # no blobs seen yet
+            self.blobdir = os.path.join(self.repo.root, b".hg", b"blobs")
+            os.mkdir(self.blobdir)
+
+        fn = self._getblobfilename(blobid)
+        blobfile = open(fn, "wb")
+        #self.ui.debug("writing blob %s to %s (%d bytes)\n"
+        #              % (blobid, fn, len(data)))
+        blobfile.write(data)
+        blobfile.close()
+
+        self.numblobs += 1
+        if self.numblobs % 500 == 0:
+            self.ui.status(b"%d blobs read\n" % self.numblobs)
+
+    def getmode(self, name, fileid):
+        (commitid, blobid) = fileid
+        return self.filemodes[commitid][name]
+
+    def checkpoint_handler(self, cmd):
+        # This command means nothing to us
+        pass
+
+    def _getcommit(self, commitref):
+        """Given a mark reference or a branch name, return the
+        appropriate commit object.  Return None if commitref is a tag
+        or a branch with no commits.  Raises KeyError if anything else
+        is out of whack.
+        """
+        if commitref.startswith(b":"):
+            # KeyError here indicates the input stream is broken.
+            return self.commitmap[commitref]
+        elif commitref.startswith(self.tagprefix):
+            return None
+        else:
+            branch = self._getbranch(commitref)
+            if branch is None:
+                raise ValueError(b"invalid commit ref: %s" % commitref)
+
+            heads = self.branchmap.get(branch)
+            if heads is None:
+                return None
+            else:
+                # KeyError here indicates bad commit id in self.branchmap.
+                return self.commitmap[heads[-1]]
+
+    def _getbranch(self, ref):
+        """Translate a Git head ref to corresponding Mercurial branch
+        name.  E.g. \"refs/heads/foo\" is translated to \"foo\".
+        Special case: \"refs/heads/master\" becomes \"default\".  If
+        'ref' is not a head ref, return None.
+        """
+        prefix = b"refs/heads/"
+        if ref.startswith(prefix):
+            branch = ref[len(prefix):]
+            if branch == b"master":
+                return b"default"
+            else:
+                return branch
+        else:
+            return None
+
+    def commit_handler(self, cmd):
+        # XXX this assumes the fixup branch name used by cvs2git.  In
+        # contrast, git-fast-import(1) recommends "TAG_FIXUP" (not under
+        # refs/heads), and implies that it can be called whatever the
+        # creator of the fastimport dump wants to call it.  So the name
+        # of the fixup branch should be configurable!
+        fixup = (cmd.ref == b"refs/heads/TAG.FIXUP")
+
+        if cmd.ref.startswith(self.tagprefix) and cmd.mark:
+            tag = cmd.ref[len(self.tagprefix):]
+            self.tags.append((tag, b':' + cmd.mark))
+
+        if cmd.from_:
+            first_parent = cmd.from_
+        else:
+            first_parent = self._getcommit(cmd.ref) # commit object
+            if first_parent is not None:
+                first_parent = first_parent.rev     # commit id
+
+        if cmd.merges:
+            if len(cmd.merges) > 1:
+                raise NotImplementedError("Can't handle more than two parents")
+            second_parent = cmd.merges[0]
+        else:
+            second_parent = None
+
+        if first_parent is None and second_parent is not None:
+            # First commit on a new branch that has 'merge' but no 'from':
+            # special case meaning branch starts with no files; the contents of
+            # the first commit (this one) determine the list of files at branch
+            # time.
+            first_parent = second_parent
+            second_parent = None
+            no_files = True             # XXX this is ignored...
+
+        bfirst_parent = first_parent or b''
+        bsecond_parent = second_parent or b''
+        self.ui.debug(b"commit %s: first_parent = %s, second_parent = %s\n"
+                      % (cmd, bfirst_parent, bsecond_parent))
+        assert ((first_parent != second_parent) or
+                (first_parent is second_parent is None)), \
+               (b"commit %s: first_parent == second parent = %s"
+                % (cmd, bfirst_parent))
+
+        # Figure out the Mercurial branch name.
+        if fixup and first_parent is not None:
+            # If this is a fixup commit, pretend it happened on the same
+            # branch as its first parent.  (We don't want a Mercurial
+            # named branch called "TAG.FIXUP" in the output repository.)
+            branch = self.commitmap[first_parent].branch
+        else:
+            branch = self._getbranch(cmd.ref)
+
+        commit_handler = HgImportCommitHandler(
+            self, cmd, self.ui)
+        commit_handler.process()
+        self.modified[cmd.id] = commit_handler.modified
+        self.filemodes[cmd.id] = commit_handler.mode
+        self.copies[cmd.id] = commit_handler.copies
+
+        # in case we are converting from git or bzr, prefer author but
+        # fallback to committer (committer is required, author is
+        # optional)
+        userinfo = cmd.author or cmd.committer
+        if userinfo[0] == userinfo[1]:
+            # In order to conform to fastimport syntax, cvs2git with no
+            # authormap produces author names like "jsmith <jsmith>"; if
+            # we see that, revert to plain old "jsmith".
+            user = userinfo[0]
+        else:
+            user = b"%s <%s>" % (userinfo[0], userinfo[1])
+
+        text = cmd.message
+        date = self.convert_date(userinfo)
+        parents = []
+        if first_parent:
+            parents.append(first_parent)
+        if second_parent:
+            parents.append(second_parent)
+
+        commit = common.commit(user, date, text, parents, branch,
+                               rev=cmd.id, sortkey=int(cmd.id[1:]))
+
+        self.commitmap[cmd.id] = commit
+        heads = self.branchmap.get(branch)
+        if heads is None:
+            heads = [cmd.id]
+        else:
+            # adding to an existing branch: replace the previous head
+            try:
+                heads.remove(first_parent)
+            except ValueError:          # first parent not a head: no problem
+                pass
+            heads.append(cmd.id)        # at end means this is tipmost
+        self.branchmap[branch] = heads
+        self.ui.debug(b"processed commit %s\n" % cmd)
+
+    def convert_date(self, c):
+        res = (int(c[2]), -int(c[3]))
+        #print c, res
+        #print type((0, 0)), type(res), len(res), type(res) is type((0, 0))
+        #if type(res) is type((0, 0)) and len(res) == 2:
+        #    print "go for it"
+        #return res
+        return b"%d %d" % res
+        
+    def reset_handler(self, cmd):
+        branch = self._getbranch(cmd.ref)
+        if branch:
+            # The usual case for 'reset': (re)create the named branch.
+            # XXX what should we do if cmd.from_ is None?
+            if cmd.from_ is not None:
+                self.branchmap[branch] = [cmd.from_]
+            else:
+                # pretend the branch never existed... is this right?!?
+                try:
+                    del self.branchmap[branch]
+                except KeyError:
+                    pass
+            #else:
+            #    # XXX filename? line number?
+            #    self.ui.warn("ignoring branch reset with no 'from'\n")
+        elif cmd.ref.startswith(self.tagprefix):
+            # Create a "lightweight tag" in Git terms.  As I understand
+            # it, that's a tag with no description and no history --
+            # rather like CVS tags.  cvs2git turns CVS tags into Git
+            # lightweight tags, so we should make sure they become
+            # Mercurial tags.  But we don't have to fake a history for
+            # them; save them up for the end.
+            if cmd.from_ is not None:
+                tag = cmd.ref[len(self.tagprefix):]
+                self.tags.append((tag, cmd.from_))
+
+    def tag_handler(self, cmd):
+        pass
+
+
+class HgImportCommitHandler(processor.CommitHandler):
+
+    def __init__(self, parent, command, ui):
+        self.parent = parent            # HgImportProcessor running the show
+        self.command = command          # CommitCommand that we're processing
+        self.ui = ui
+
+        # Files changes by this commit as a list of (filename, id)
+        # tuples where id is (commitid, blobid).  The blobid is
+        # needed to fetch the file's contents later, and the commitid
+        # is needed to fetch the mode.
+        # (XXX what about inline file contents?)
+        # (XXX how to describe deleted files?)
+        self.modified = []
+
+        # mode of files listed in self.modified: '', 'x', or 'l'
+        self.mode = {}
+
+        # dictionary of src: dest (renamed files are in here and self.modified)
+        self.copies = {}
+
+        # number of inline files seen in this commit
+        self.inlinecount = 0
+        
+    def modify_handler(self, filecmd):
+        if filecmd.dataref:
+            blobid = filecmd.dataref    # blobid is the mark of the blob
+        else:
+            blobid = b"%s-inline:%d" % (self.command.id, self.inlinecount)
+            assert filecmd.data is not None
+            self.parent.writeblob(blobid, filecmd.data)
+            self.inlinecount += 1
+
+        fileid = (self.command.id, blobid)
+
+        self.modified.append((filecmd.path, fileid))
+        if stat.S_ISLNK(filecmd.mode): # link
+            mode = b'l'
+        elif filecmd.mode & 0o111: # executable
+            mode = b'x'
+        elif stat.S_ISREG(filecmd.mode): # regular file
+            mode = b''
+        else:
+            raise RuntimeError(b"mode %s unsupported" % filecmd.mode)
+
+        self.mode[filecmd.path] = mode
+
+    def delete_handler(self, filecmd):
+        self.modified.append((filecmd.path, None))
+
+    def copy_handler(self, filecmd):
+        self.copies[filecmd.src_path] = filecmd.dest_path
+
+    def rename_handler(self, filecmd):
+        # copy oldname to newname and delete oldname
+        self.copies[filecmd.new_path] = filecmd.old_path
+        self.modified.append((filecmd.old_path, None))