diff hgfastimport/hgimport.py @ 47:7ff36dc9f0b1

Massive rework to use infrastructure provided by convert extension. fastimport no longer stages changes in the repository's working copy; instead, it now works like any other convert source: the imported history is kept in memory (except for file contents) and then processed by the 'convert' extension.
author Greg Ward <greg-hg@gerg.ca>
date Sat, 16 May 2009 12:57:22 -0400
parents 93c2b1e832bd
children 1cf21a8c274b
line wrap: on
line diff
--- a/hgfastimport/hgimport.py	Sun May 10 14:16:02 2009 -0400
+++ b/hgfastimport/hgimport.py	Sat May 16 12:57:22 2009 -0400
@@ -21,29 +21,119 @@
 """
 
 import os
-import os.path
-import errno
 import shutil
 
-import mercurial.hg
-import mercurial.commands
-from mercurial import util
-from mercurial.node import nullrev, hex
+from hgext.convert import common
+
+from fastimport import processor, parser
+
+class fastimport_source(common.converter_source):
+    """Interface between the fastimport processor below and Mercurial's
+    normal conversion infrastructure.
+    """
+    def __init__(self, ui, repo, sources):
+        self.ui = ui
+        self.sources = sources
+        self.processor = HgImportProcessor(ui, repo)
+        self.parsed = False
+
+    # converter_source methods
+
+    def before(self):
+        self.processor.setup()
+
+    def after(self):
+        self.processor.teardown()
+
+    def getheads(self):
+        """Return a list of this repository's heads"""
+        self._parse()
+        allheads = []
+        for branchheads in self.processor.branchmap.values():
+            allheads.extend(branchheads)
+        return allheads
+
+    def getfile(self, name, fileid):
+        """Return file contents as a string. fileid is the identifier
+        returned by getchanges(); None (a deleted file) raises IOError.
+        """
+        if fileid is None:              # deleted file
+            raise IOError
+        return self.processor.getblob(fileid)
+
+    def getmode(self, name, fileid):
+        """Return file mode, eg. '', 'x', or 'l'. fileid is the identifier
+        returned by a previous call to getchanges().
+        """
+        return self.processor.getmode(name, fileid)
+
+    def getchanges(self, commitid):
+        """Returns a tuple of (files, copies).
 
-from fastimport import processor
-from hgfastimport import hgechoprocessor
+        files is a sorted list of (filename, id) tuples for all files
+        changed between commitid and its first parent returned by
+        getcommit(). id is the source revision id of the file.
+
+        copies is a dictionary of dest: source
+        """
+        return (self.processor.modified[commitid],
+                self.processor.copies[commitid])
+
+    def getcommit(self, commitid):
+        """Return the commit object for commitid"""
+        if commitid is None:
+            return None
+        else:
+            return self.processor.commitmap[commitid]
+
+    def gettags(self):
+        """Return the tags as a dictionary of name: revision"""
+        return dict(self.processor.tags)
+    
+    def getchangedfiles(self, rev, i):
+        """Return the files changed by rev compared to parent[i].
+
+        i is an index selecting one of the parents of rev.  The return
+        value should be the list of files that are different in rev and
+        this parent.
+
+        If rev has no parents, i is None.
+
+        This function is only needed to support --filemap
+        """
+        raise NotImplementedError()
+
+    # private worker methods
+
+    def _parse(self):
+        """Feed every source stream to the processor (once only)."""
+        if self.parsed:
+            return
+        for source in self.sources:
+            self.ui.debug("reading fastimport source: %s\n" % source)
+            f = open(source)
+            try:
+                self.processor.process(parser.ImportParser(f).iter_commands)
+            finally:
+                f.close()               # close even if processing fails
+        self.parsed = True
 
 class HgImportProcessor(processor.ImportProcessor):
     
-    def __init__(self, ui, repo, **opts):
+    def __init__(self, ui, repo):
         super(HgImportProcessor, self).__init__()
         self.ui = ui
         self.repo = repo
-        self.opts = opts
-        self.last_commit = None         # CommitCommand object
-        self.mark_map = {}              # map mark (e.g. ":1") to revision number
-        self.branch_map = {}            # map git branch name to revision number
-        self.lightweight_tags = []      # list of (ref, mark) tuples
+
+        self.commitmap = {}             # map commit ID (":1") to commit object
+        self.branchmap = {}             # map branch name to list of heads
+
+        # see HgImportCommitHandler for details on these three
+        self.modified = {}              # map commit id to list of file mods
+        self.filemodes = {}             # map commit id to {filename: mode} map
+        self.copies = {}                # map commit id to dict of file copies
+
+        self.tags = []                  # list of (tag, mark) tuples
 
         self.numblobs = 0               # for progress reporting
         self.blobdir = None
@@ -54,13 +144,6 @@
 
     def teardown(self):
         """Cleanup after processing all streams."""
-        # Hmmm: this isn't really a cleanup step, it's a post-processing
-        # step.  But we currently have one processor per input
-        # stream... despite the fact that state like mark_map,
-        # branch_map, and lightweight_tags really should span input
-        # streams.
-        self.write_lightweight_tags()
-
         if self.blobdir and os.path.exists(self.blobdir):
             self.ui.status("Removing blob dir %r ...\n" % self.blobdir)
             shutil.rmtree(self.blobdir)
@@ -69,38 +152,83 @@
         self.ui.write("Progress: %s\n" % cmd.message)
 
     def blob_handler(self, cmd):
+        self.writeblob(cmd.id, cmd.data)
+
+    def _getblobfilename(self, blobid):
+        if self.blobdir is None:
+            raise RuntimeError("no blobs seen, so no blob directory created")
+        # XXX should escape ":" for windows
+        return os.path.join(self.blobdir, "blob-" + blobid)
+
+    def getblob(self, fileid):
+        (commitid, blobid) = fileid
+        f = open(self._getblobfilename(blobid), "rb")
+        try:
+            return f.read()
+        finally:
+            f.close()
+
+    def writeblob(self, blobid, data):
         if self.blobdir is None:        # no blobs seen yet
-            # XXX cleanup?
             self.blobdir = os.path.join(self.repo.root, ".hg", "blobs")
             os.mkdir(self.blobdir)
 
-        fn = self.getblobfilename(cmd.id)
+        fn = self._getblobfilename(blobid)
         blobfile = open(fn, "wb")
         #self.ui.debug("writing blob %s to %s (%d bytes)\n"
-        #              % (cmd.id, fn, len(cmd.data)))
-        blobfile.write(cmd.data)
+        #              % (blobid, fn, len(data)))
+        blobfile.write(data)
         blobfile.close()
 
         self.numblobs += 1
         if self.numblobs % 500 == 0:
             self.ui.status("%d blobs read\n" % self.numblobs)
 
-    def getblobfilename(self, blobid):
-        if self.blobdir is None:
-            raise RuntimeError("no blobs seen, so no blob directory created")
-        # XXX should escape ":" for windows
-        return os.path.join(self.blobdir, "blob-" + blobid)
+    def getmode(self, name, fileid):
+        (commitid, blobid) = fileid
+        return self.filemodes[commitid][name]
 
     def checkpoint_handler(self, cmd):
         # This command means nothing to us
         pass
 
-    def committish_rev(self, committish):
+    def _getcommit(self, committish):
+        """Given a mark (":1") or a Git head ref, return the matching
+        commit object.  Return None if committish is a branch with no
+        commits.  Raises KeyError for an unknown mark and ValueError
+        if committish is neither a mark nor a "refs/heads/" ref.
+        """
         if committish.startswith(":"):
-            return self.mark_map[committish]
+            # KeyError here indicates the input stream is broken.
+            return self.commitmap[committish]
         else:
-            return self.branch_map[committish]
-        
+            branch = self._getbranch(committish)
+            if branch is None:
+                raise ValueError("invalid committish: %r" % committish)
+
+            heads = self.branchmap.get(branch)
+            if heads is None:
+                return None
+            else:
+                # KeyError here indicates bad commit id in self.branchmap.
+                return self.commitmap[heads[-1]]
+
+    def _getbranch(self, ref):
+        """Translate a Git head ref to corresponding Mercurial branch
+        name.  E.g. \"refs/heads/foo\" is translated to \"foo\".
+        Special case: \"refs/heads/master\" becomes \"default\".  If
+        'ref' is not a head ref, return None.
+        """
+        prefix = "refs/heads/"
+        if ref.startswith(prefix):
+            branch = ref[len(prefix):]
+            if branch == "master":
+                return "default"
+            else:
+                return branch
+        else:
+            return None
+
     def commit_handler(self, cmd):
         # XXX this assumes the fixup branch name used by cvs2git.  In
         # contrast, git-fast-import(1) recommends "TAG_FIXUP" (not under
@@ -110,76 +238,83 @@
         fixup = (cmd.ref == "refs/heads/TAG.FIXUP")
 
         if cmd.from_:
-            first_parent = self.committish_rev(cmd.from_)
+            first_parent = cmd.from_
         else:
-            first_parent = self.branch_map.get(cmd.ref, nullrev)
+            first_parent = self._getcommit(cmd.ref) # commit object
+            if first_parent is not None:
+                first_parent = first_parent.rev     # commit id
+
         if cmd.merges:
             if len(cmd.merges) > 1:
                 raise NotImplementedError("Can't handle more than two parents")
-            second_parent = self.committish_rev(cmd.merges[0])
+            second_parent = cmd.merges[0]
         else:
-            second_parent = nullrev
+            second_parent = None
 
-        if first_parent is nullrev and second_parent is not nullrev:
+        if first_parent is None and second_parent is not None:
             # First commit on a new branch that has 'merge' but no 'from':
             # special case meaning branch starts with no files; the contents of
             # the first commit (this one) determine the list of files at branch
             # time.
             first_parent = second_parent
-            second_parent = nullrev
-            no_files = True             # XXX not handled
+            second_parent = None
+            no_files = True             # XXX this is ignored...
 
         self.ui.debug("commit %s: first_parent = %r, second_parent = %r\n"
-                      % (cmd.id, first_parent, second_parent))
+                      % (cmd, first_parent, second_parent))
         assert ((first_parent != second_parent) or
-                (first_parent == second_parent == -1)), \
+                (first_parent is second_parent is None)), \
                ("commit %s: first_parent == second parent = %r"
-                % (cmd.id, first_parent))
-
-        # Update to the first parent
-        mercurial.hg.clean(self.repo, self.repo.lookup(first_parent))
-        mercurial.commands.debugsetparents(
-            self.ui, self.repo, first_parent, second_parent)
+                % (cmd, first_parent))
 
-        if cmd.ref == "refs/heads/master":
-            branch = "default"
-        elif fixup and first_parent is not nullrev:
-            # If this is a fixup commit, pretend it happened on the same branch
-            # as its first parent.  (We don't want a Mercurial named branch
-            # called "TAG.FIXUP" in the output repository.)
-            branch = self.repo.changectx(first_parent).branch()
+        # Figure out the Mercurial branch name.
+        if fixup and first_parent is not None:
+            # If this is a fixup commit, pretend it happened on the same
+            # branch as its first parent.  (We don't want a Mercurial
+            # named branch called "TAG.FIXUP" in the output repository.)
+            branch = self.commitmap[first_parent].branch
         else:
-            branch = cmd.ref[len("refs/heads/"):]
-        self.repo.dirstate.setbranch(branch)
+            branch = self._getbranch(cmd.ref)
+
         commit_handler = HgImportCommitHandler(
-            self, cmd, self.ui, self.repo, **self.opts)
+            self, cmd, self.ui)
         commit_handler.process()
+        self.modified[cmd.id] = commit_handler.modified
+        self.filemodes[cmd.id] = commit_handler.mode
+        self.copies[cmd.id] = commit_handler.copies
 
         # in case we are converting from git or bzr, prefer author but
         # fallback to committer (committer is required, author is
         # optional)
         userinfo = cmd.author or cmd.committer
-        user = "%s <%s>" % (userinfo[0], userinfo[1])
+        if userinfo[0] == userinfo[1]:
+            # In order to conform to fastimport syntax, cvs2git with no
+            # authormap produces author names like "jsmith <jsmith>"; if
+            # we see that, revert to plain old "jsmith".
+            user = userinfo[0]
+        else:
+            user = "%s <%s>" % (userinfo[0], userinfo[1])
 
-        # Blech: have to monkeypatch mercurial.encoding to ensure that
-        # everything under rawcommit() assumes the same encoding,
-        # regardless of current locale.
-        from mercurial import encoding
-        encoding.encoding = "UTF-8"
-
-        files = commit_handler.filelist()
         assert type(cmd.message) is unicode
-        text = cmd.message.encode("utf-8") # XXX cmd.message is unicode
+        text = cmd.message.encode("utf-8")
         date = self.convert_date(userinfo)
-        node = self.repo.rawcommit(
-            files=files, text=text, user=user, date=date)
-        rev = self.repo.changelog.rev(node)
-        if cmd.mark is not None:
-            self.mark_map[":" + cmd.mark] = rev
-        if not fixup:
-            self.branch_map[cmd.ref] = rev
-            self.last_commit = cmd
-        self.ui.write("Done commit of rev %d\n" % rev)
+
+        parents = filter(None, [first_parent, second_parent])
+        commit = common.commit(user, date, text, parents, branch, rev=cmd.id)
+
+        self.commitmap[cmd.id] = commit
+        heads = self.branchmap.get(branch)
+        if heads is None:
+            heads = [cmd.id]
+        else:
+            # adding to an existing branch: replace the previous head
+            try:
+                heads.remove(first_parent)
+            except ValueError:          # first parent not a head: no problem
+                pass
+            heads.append(cmd.id)        # at end means this is tipmost
+        self.branchmap[branch] = heads
+        self.ui.debug("processed commit %s\n" % cmd)
 
     def convert_date(self, c):
         res = (int(c[2]), int(c[3]))
@@ -191,119 +326,90 @@
         return "%d %d" % res
         
     def reset_handler(self, cmd):
-        if cmd.ref.startswith("refs/heads/"):
+        tagprefix = "refs/tags/"
+        branch = self._getbranch(cmd.ref)
+        if branch:
             # The usual case for 'reset': (re)create the named branch.
             # XXX what should we do if cmd.from_ is None?
             if cmd.from_ is not None:
-                self.branch_map[cmd.ref] = self.committish_rev(cmd.from_)
+                self.branchmap[branch] = [cmd.from_]
             else:
                 # pretend the branch never existed... is this right?!?
                 try:
-                    del self.branch_map[cmd.ref]
+                    del self.branchmap[branch]
                 except KeyError:
                     pass
             #else:
             #    # XXX filename? line number?
             #    self.ui.warn("ignoring branch reset with no 'from'\n")
-        elif cmd.ref.startswith("refs/tags/"):
+        elif cmd.ref.startswith(tagprefix):
             # Create a "lightweight tag" in Git terms.  As I understand
             # it, that's a tag with no description and no history --
             # rather like CVS tags.  cvs2git turns CVS tags into Git
             # lightweight tags, so we should make sure they become
             # Mercurial tags.  But we don't have to fake a history for
             # them; save them up for the end.
-            self.lightweight_tags.append((cmd.ref, cmd.from_))
+            tag = cmd.ref[len(tagprefix):]
+            self.tags.append((tag, cmd.from_))
 
     def tag_handler(self, cmd):
         pass
 
-    def write_lightweight_tags(self):
-        if not self.lightweight_tags:   # avoid writing empty .hgtags
-            return
-
-        # XXX what about duplicate tags?  lightweight_tags is
-        # deliberately a list, to preserve order ... but do we need to
-        # worry about repeated tags?  (Certainly not for cvs2git output,
-        # since CVS has no tag history.)
-
-        # Create Mercurial tags from git-style "lightweight tags" in the
-        # input stream.
-        self.ui.status("updating tags\n")
-        mercurial.hg.clean(self.repo, self.repo.lookup("default"))
-        tagfile = open(self.repo.wjoin(".hgtags"), "ab")
-        for (ref, mark) in self.lightweight_tags:
-            tag = ref[len("refs/tags/"):]
-            rev = self.mark_map[mark]
-            node = self.repo.changelog.node(rev)
-            tagfile.write("%s %s\n" % (hex(node), tag))
-        tagfile.close()
-
-        files = [".hgtags"]
-        self.repo.rawcommit(
-            files=files, text="update tags", user="convert-repo", date=None)
 
 class HgImportCommitHandler(processor.CommitHandler):
 
-    def __init__(self, parent, command, ui, repo, **opts):
+    def __init__(self, parent, command, ui):
         self.parent = parent            # HgImportProcessor running the show
-        self.command = command
+        self.command = command          # CommitCommand that we're processing
         self.ui = ui
-        self.repo = repo
-        self.opts = opts
-        self.files = set()
+
+        # Files changed by this commit as a list of (filename, id)
+        # tuples where id is (commitid, blobid).  The blobid is
+        # needed to fetch the file's contents later, and the commitid
+        # is needed to fetch the mode.
+        # (inline file contents get a synthetic blobid; see modify_handler)
+        # (deleted files are listed with id None; see delete_handler)
+        self.modified = []
 
-    def _make_container(self, path):
-        if '/' in path:
-            d = os.path.dirname(path)
-            if not os.path.isdir(d):
-                os.makedirs(d)
+        # mode of files listed in self.modified: '', 'x', or 'l'
+        self.mode = {}
+
+        # dictionary of dest: src (renamed files are in here and self.modified)
+        self.copies = {}
+
+        # number of inline files seen in this commit
+        self.inlinecount = 0
         
     def modify_handler(self, filecmd):
-        #print "============================" + filecmd.path
-        # FIXME: handle mode
-        self.files.add(filecmd.path)
-        fullpath = os.path.join(self.repo.root, filecmd.path)
-        self._make_container(fullpath)
-        #print "made dirs, writing file"
         if filecmd.dataref:
-            # reference to a blob that has already appeared in the stream
-            fn = self.parent.getblobfilename(filecmd.dataref)
-            if os.path.exists(fullpath):
-                os.remove(fullpath)
-            try:
-                os.link(fn, fullpath)
-            except OSError, err:
-                if err.errno == errno.ENOENT:
-                    # if this happens, it's a problem in the fast-import
-                    # stream
-                    raise util.Abort("bad blob ref %r (no such file %s)"
-                                     % (filecmd.dataref, fn))
-                else:
-                    # anything else is a bug in this extension
-                    # (cross-device move, permissions, etc.)
-                    raise
-        elif filecmd.data:
-            f = open(fullpath, "w")
-            f.write(filecmd.data)
-            f.close()
+            blobid = filecmd.dataref    # blobid is the mark of the blob
         else:
-            raise RuntimeError("either filecmd.dataref or filecmd.data must be set")
-        #print self.repo.add([filecmd.path])
-        #print "Done:", filecmd.path
+            blobid = "%s-inline:%d" % (self.command.id, self.inlinecount)
+            assert filecmd.data is not None
+            self.parent.writeblob(blobid, filecmd.data)
+            self.inlinecount += 1
+
+        fileid = (self.command.id, blobid)
+
+        self.modified.append((filecmd.path, fileid))
+        if filecmd.mode.endswith("644"): # normal file
+            mode = ''
+        elif filecmd.mode.endswith("755"): # executable
+            mode = 'x'
+        elif filecmd.mode == "120000":  # symlink
+            mode = 'l'
+        else:
+            raise RuntimeError("mode %r unsupported" % filecmd.mode)
+
+        self.mode[filecmd.path] = mode
 
     def delete_handler(self, filecmd):
-        self.files.add(filecmd.path)
-        self.repo.remove([filecmd.path], unlink=True)
+        self.modified.append((filecmd.path, None))
 
-    #def copy_handler(self, filecmd):
-    #    self.files.add(filecmd.path)
-    #    """Handle a filecopy command."""
-    #    self.ui.write("Cmd: %s\n" % repr(filecmd))
+    def copy_handler(self, filecmd):
+        self.copies[filecmd.dest_path] = filecmd.src_path  # dest: source
 
-    #def rename_handler(self, filecmd):
-    #    self.files.add(filecmd.path)
-    #    """Handle a filerename command."""
-    #    self.ui.write("Cmd: %s\n" % repr(filecmd))
-
-    def filelist(self):
-        return list(self.files)
+    def rename_handler(self, filecmd):
+        # record the copy as dest: source, then mark oldname as deleted
+        self.copies[filecmd.newname] = filecmd.oldname
+        self.modified.append((filecmd.oldname, None))