Mercurial > hg > hg-fastimport
diff hgfastimport/hgimport.py @ 47:7ff36dc9f0b1
Massive rework to use infrastructure provided by convert extension.
fastimport no longer stages changes in the repository's working copy;
instead, it now works like any other convert source: the imported
history is kept in memory (except for file contents) and then
processed by the 'convert' extension.
| author | Greg Ward <greg-hg@gerg.ca> |
|---|---|
| date | Sat, 16 May 2009 12:57:22 -0400 |
| parents | 93c2b1e832bd |
| children | 1cf21a8c274b |
line wrap: on
line diff
--- a/hgfastimport/hgimport.py Sun May 10 14:16:02 2009 -0400 +++ b/hgfastimport/hgimport.py Sat May 16 12:57:22 2009 -0400 @@ -21,29 +21,119 @@ """ import os -import os.path -import errno import shutil -import mercurial.hg -import mercurial.commands -from mercurial import util -from mercurial.node import nullrev, hex +from hgext.convert import common + +from fastimport import processor, parser + +class fastimport_source(common.converter_source): + """Interface between the fastimport processor below and Mercurial's + normal conversion infrastructure. + """ + def __init__(self, ui, repo, sources): + self.ui = ui + self.sources = sources + self.processor = HgImportProcessor(ui, repo) + self.parsed = False + + # converter_source methods + + def before(self): + self.processor.setup() + + def after(self): + self.processor.teardown() + + def getheads(self): + """Return a list of this repository's heads""" + self._parse() + allheads = [] + for branchheads in self.processor.branchmap.values(): + allheads.extend(branchheads) + return allheads + + def getfile(self, name, fileid): + """Return file contents as a string. rev is the identifier returned + by a previous call to getchanges(). + """ + if fileid is None: # deleted file + raise IOError + return self.processor.getblob(fileid) + + def getmode(self, name, fileid): + """Return file mode, eg. '', 'x', or 'l'. rev is the identifier + returned by a previous call to getchanges(). + """ + return self.processor.getmode(name, fileid) + + def getchanges(self, commitid): + """Returns a tuple of (files, copies). -from fastimport import processor -from hgfastimport import hgechoprocessor + files is a sorted list of (filename, id) tuples for all files + changed between commitid and its first parent returned by + getcommit(). id is the source revision id of the file. 
+ + copies is a dictionary of dest: source + """ + return (self.processor.modified[commitid], + self.processor.copies[commitid]) + + def getcommit(self, commitid): + """Return the commit object for commitid""" + if commitid is None: + return None + else: + return self.processor.commitmap[commitid] + + def gettags(self): + """Return the tags as a dictionary of name: revision""" + return dict(self.processor.tags) + + def getchangedfiles(self, rev, i): + """Return the files changed by rev compared to parent[i]. + + i is an index selecting one of the parents of rev. The return + value should be the list of files that are different in rev and + this parent. + + If rev has no parents, i is None. + + This function is only needed to support --filemap + """ + raise NotImplementedError() + + # private worker methods + + def _parse(self): + if self.parsed: + return + + for source in self.sources: + self.ui.debug("reading fastimport source: %s\n" % source) + f = open(source) + p = parser.ImportParser(f) + self.processor.process(p.iter_commands) + f.close() + + self.parsed = True class HgImportProcessor(processor.ImportProcessor): - def __init__(self, ui, repo, **opts): + def __init__(self, ui, repo): super(HgImportProcessor, self).__init__() self.ui = ui self.repo = repo - self.opts = opts - self.last_commit = None # CommitCommand object - self.mark_map = {} # map mark (e.g. 
":1") to revision number - self.branch_map = {} # map git branch name to revision number - self.lightweight_tags = [] # list of (ref, mark) tuples + + self.commitmap = {} # map commit ID (":1") to commit object + self.branchmap = {} # map branch name to list of heads + + # see HgImportCommitHandler for details on these three + self.modified = {} # map commit id to list of file mods + self.filemodes = {} # map commit id to {filename: mode} map + self.copies = {} # map commit id to dict of file copies + + self.tags = [] # list of (tag, mark) tuples self.numblobs = 0 # for progress reporting self.blobdir = None @@ -54,13 +144,6 @@ def teardown(self): """Cleanup after processing all streams.""" - # Hmmm: this isn't really a cleanup step, it's a post-processing - # step. But we currently have one processor per input - # stream... despite the fact that state like mark_map, - # branch_map, and lightweight_tags really should span input - # streams. - self.write_lightweight_tags() - if self.blobdir and os.path.exists(self.blobdir): self.ui.status("Removing blob dir %r ...\n" % self.blobdir) shutil.rmtree(self.blobdir) @@ -69,38 +152,83 @@ self.ui.write("Progress: %s\n" % cmd.message) def blob_handler(self, cmd): + self.writeblob(cmd.id, cmd.data) + + def _getblobfilename(self, blobid): + if self.blobdir is None: + raise RuntimeError("no blobs seen, so no blob directory created") + # XXX should escape ":" for windows + return os.path.join(self.blobdir, "blob-" + blobid) + + def getblob(self, fileid): + (commitid, blobid) = fileid + f = open(self._getblobfilename(blobid), "rb") + try: + return f.read() + finally: + f.close() + + def writeblob(self, blobid, data): if self.blobdir is None: # no blobs seen yet - # XXX cleanup? 
self.blobdir = os.path.join(self.repo.root, ".hg", "blobs") os.mkdir(self.blobdir) - fn = self.getblobfilename(cmd.id) + fn = self._getblobfilename(blobid) blobfile = open(fn, "wb") #self.ui.debug("writing blob %s to %s (%d bytes)\n" - # % (cmd.id, fn, len(cmd.data))) - blobfile.write(cmd.data) + # % (blobid, fn, len(data))) + blobfile.write(data) blobfile.close() self.numblobs += 1 if self.numblobs % 500 == 0: self.ui.status("%d blobs read\n" % self.numblobs) - def getblobfilename(self, blobid): - if self.blobdir is None: - raise RuntimeError("no blobs seen, so no blob directory created") - # XXX should escape ":" for windows - return os.path.join(self.blobdir, "blob-" + blobid) + def getmode(self, name, fileid): + (commitid, blobid) = fileid + return self.filemodes[commitid][name] def checkpoint_handler(self, cmd): # This command means nothing to us pass - def committish_rev(self, committish): + def _getcommit(self, committish): + """Given a mark reference or a branch name, return the + appropriate commit object. Return None if committish is a + branch with no commits. Raises KeyError if anything else is out + of whack. + """ if committish.startswith(":"): - return self.mark_map[committish] + # KeyError here indicates the input stream is broken. + return self.commitmap[committish] else: - return self.branch_map[committish] - + branch = self._getbranch(committish) + if branch is None: + raise ValueError("invalid committish: %r" % committish) + + heads = self.branchmap.get(branch) + if heads is None: + return None + else: + # KeyError here indicates bad commit id in self.branchmap. + return self.commitmap[heads[-1]] + + def _getbranch(self, ref): + """Translate a Git head ref to corresponding Mercurial branch + name. E.g. \"refs/heads/foo\" is translated to \"foo\". + Special case: \"refs/heads/master\" becomes \"default\". If + 'ref' is not a head ref, return None. 
+ """ + prefix = "refs/heads/" + if ref.startswith(prefix): + branch = ref[len(prefix):] + if branch == "master": + return "default" + else: + return branch + else: + return None + def commit_handler(self, cmd): # XXX this assumes the fixup branch name used by cvs2git. In # contrast, git-fast-import(1) recommends "TAG_FIXUP" (not under @@ -110,76 +238,83 @@ fixup = (cmd.ref == "refs/heads/TAG.FIXUP") if cmd.from_: - first_parent = self.committish_rev(cmd.from_) + first_parent = cmd.from_ else: - first_parent = self.branch_map.get(cmd.ref, nullrev) + first_parent = self._getcommit(cmd.ref) # commit object + if first_parent is not None: + first_parent = first_parent.rev # commit id + if cmd.merges: if len(cmd.merges) > 1: raise NotImplementedError("Can't handle more than two parents") - second_parent = self.committish_rev(cmd.merges[0]) + second_parent = cmd.merges[0] else: - second_parent = nullrev + second_parent = None - if first_parent is nullrev and second_parent is not nullrev: + if first_parent is None and second_parent is not None: # First commit on a new branch that has 'merge' but no 'from': # special case meaning branch starts with no files; the contents of # the first commit (this one) determine the list of files at branch # time. first_parent = second_parent - second_parent = nullrev - no_files = True # XXX not handled + second_parent = None + no_files = True # XXX this is ignored... 
self.ui.debug("commit %s: first_parent = %r, second_parent = %r\n" - % (cmd.id, first_parent, second_parent)) + % (cmd, first_parent, second_parent)) assert ((first_parent != second_parent) or - (first_parent == second_parent == -1)), \ + (first_parent is second_parent is None)), \ ("commit %s: first_parent == second parent = %r" - % (cmd.id, first_parent)) - - # Update to the first parent - mercurial.hg.clean(self.repo, self.repo.lookup(first_parent)) - mercurial.commands.debugsetparents( - self.ui, self.repo, first_parent, second_parent) + % (cmd, first_parent)) - if cmd.ref == "refs/heads/master": - branch = "default" - elif fixup and first_parent is not nullrev: - # If this is a fixup commit, pretend it happened on the same branch - # as its first parent. (We don't want a Mercurial named branch - # called "TAG.FIXUP" in the output repository.) - branch = self.repo.changectx(first_parent).branch() + # Figure out the Mercurial branch name. + if fixup and first_parent is not None: + # If this is a fixup commit, pretend it happened on the same + # branch as its first parent. (We don't want a Mercurial + # named branch called "TAG.FIXUP" in the output repository.) 
+ branch = self.commitmap[first_parent].branch else: - branch = cmd.ref[len("refs/heads/"):] - self.repo.dirstate.setbranch(branch) + branch = self._getbranch(cmd.ref) + commit_handler = HgImportCommitHandler( - self, cmd, self.ui, self.repo, **self.opts) + self, cmd, self.ui) commit_handler.process() + self.modified[cmd.id] = commit_handler.modified + self.filemodes[cmd.id] = commit_handler.mode + self.copies[cmd.id] = commit_handler.copies # in case we are converting from git or bzr, prefer author but # fallback to committer (committer is required, author is # optional) userinfo = cmd.author or cmd.committer - user = "%s <%s>" % (userinfo[0], userinfo[1]) + if userinfo[0] == userinfo[1]: + # In order to conform to fastimport syntax, cvs2git with no + # authormap produces author names like "jsmith <jsmith>"; if + # we see that, revert to plain old "jsmith". + user = userinfo[0] + else: + user = "%s <%s>" % (userinfo[0], userinfo[1]) - # Blech: have to monkeypatch mercurial.encoding to ensure that - # everything under rawcommit() assumes the same encoding, - # regardless of current locale. 
- from mercurial import encoding - encoding.encoding = "UTF-8" - - files = commit_handler.filelist() assert type(cmd.message) is unicode - text = cmd.message.encode("utf-8") # XXX cmd.message is unicode + text = cmd.message.encode("utf-8") date = self.convert_date(userinfo) - node = self.repo.rawcommit( - files=files, text=text, user=user, date=date) - rev = self.repo.changelog.rev(node) - if cmd.mark is not None: - self.mark_map[":" + cmd.mark] = rev - if not fixup: - self.branch_map[cmd.ref] = rev - self.last_commit = cmd - self.ui.write("Done commit of rev %d\n" % rev) + + parents = filter(None, [first_parent, second_parent]) + commit = common.commit(user, date, text, parents, branch, rev=cmd.id) + + self.commitmap[cmd.id] = commit + heads = self.branchmap.get(branch) + if heads is None: + heads = [cmd.id] + else: + # adding to an existing branch: replace the previous head + try: + heads.remove(first_parent) + except ValueError: # first parent not a head: no problem + pass + heads.append(cmd.id) # at end means this is tipmost + self.branchmap[branch] = heads + self.ui.debug("processed commit %s\n" % cmd) def convert_date(self, c): res = (int(c[2]), int(c[3])) @@ -191,119 +326,90 @@ return "%d %d" % res def reset_handler(self, cmd): - if cmd.ref.startswith("refs/heads/"): + tagprefix = "refs/tags/" + branch = self._getbranch(cmd.ref) + if branch: # The usual case for 'reset': (re)create the named branch. # XXX what should we do if cmd.from_ is None? if cmd.from_ is not None: - self.branch_map[cmd.ref] = self.committish_rev(cmd.from_) + self.branchmap[branch] = [cmd.from_] else: # pretend the branch never existed... is this right?!? try: - del self.branch_map[cmd.ref] + del self.branchmap[branch] except KeyError: pass #else: # # XXX filename? line number? # self.ui.warn("ignoring branch reset with no 'from'\n") - elif cmd.ref.startswith("refs/tags/"): + elif cmd.ref.startswith(tagprefix): # Create a "lightweight tag" in Git terms. 
As I understand # it, that's a tag with no description and no history -- # rather like CVS tags. cvs2git turns CVS tags into Git # lightweight tags, so we should make sure they become # Mercurial tags. But we don't have to fake a history for # them; save them up for the end. - self.lightweight_tags.append((cmd.ref, cmd.from_)) + tag = cmd.ref[len(tagprefix):] + self.tags.append((tag, cmd.from_)) def tag_handler(self, cmd): pass - def write_lightweight_tags(self): - if not self.lightweight_tags: # avoid writing empty .hgtags - return - - # XXX what about duplicate tags? lightweight_tags is - # deliberately a list, to preserve order ... but do we need to - # worry about repeated tags? (Certainly not for cvs2git output, - # since CVS has no tag history.) - - # Create Mercurial tags from git-style "lightweight tags" in the - # input stream. - self.ui.status("updating tags\n") - mercurial.hg.clean(self.repo, self.repo.lookup("default")) - tagfile = open(self.repo.wjoin(".hgtags"), "ab") - for (ref, mark) in self.lightweight_tags: - tag = ref[len("refs/tags/"):] - rev = self.mark_map[mark] - node = self.repo.changelog.node(rev) - tagfile.write("%s %s\n" % (hex(node), tag)) - tagfile.close() - - files = [".hgtags"] - self.repo.rawcommit( - files=files, text="update tags", user="convert-repo", date=None) class HgImportCommitHandler(processor.CommitHandler): - def __init__(self, parent, command, ui, repo, **opts): + def __init__(self, parent, command, ui): self.parent = parent # HgImportProcessor running the show - self.command = command + self.command = command # CommitCommand that we're processing self.ui = ui - self.repo = repo - self.opts = opts - self.files = set() + + # Files changed by this commit as a list of (filename, id) + # tuples where id is (commitid, blobid). The blobid is + # needed to fetch the file's contents later, and the commitid + # is needed to fetch the mode. + # (XXX what about inline file contents?) + # (XXX how to describe deleted files?) 
+ self.modified = [] - def _make_container(self, path): - if '/' in path: - d = os.path.dirname(path) - if not os.path.isdir(d): - os.makedirs(d) + # mode of files listed in self.modified: '', 'x', or 'l' + self.mode = {} + + # dictionary of src: dest (renamed files are in here and self.modified) + self.copies = {} + + # number of inline files seen in this commit + self.inlinecount = 0 def modify_handler(self, filecmd): - #print "============================" + filecmd.path - # FIXME: handle mode - self.files.add(filecmd.path) - fullpath = os.path.join(self.repo.root, filecmd.path) - self._make_container(fullpath) - #print "made dirs, writing file" if filecmd.dataref: - # reference to a blob that has already appeared in the stream - fn = self.parent.getblobfilename(filecmd.dataref) - if os.path.exists(fullpath): - os.remove(fullpath) - try: - os.link(fn, fullpath) - except OSError, err: - if err.errno == errno.ENOENT: - # if this happens, it's a problem in the fast-import - # stream - raise util.Abort("bad blob ref %r (no such file %s)" - % (filecmd.dataref, fn)) - else: - # anything else is a bug in this extension - # (cross-device move, permissions, etc.) 
- raise - elif filecmd.data: - f = open(fullpath, "w") - f.write(filecmd.data) - f.close() + blobid = filecmd.dataref # blobid is the mark of the blob else: - raise RuntimeError("either filecmd.dataref or filecmd.data must be set") - #print self.repo.add([filecmd.path]) - #print "Done:", filecmd.path + blobid = "%s-inline:%d" % (self.command.id, self.inlinecount) + assert filecmd.data is not None + self.parent.writeblob(blobid, filecmd.data) + self.inlinecount += 1 + + fileid = (self.command.id, blobid) + + self.modified.append((filecmd.path, fileid)) + if filecmd.mode.endswith("644"): # normal file + mode = '' + elif filecmd.mode.endswith("755"): # executable + mode = 'x' + elif filecmd.mode == "120000": # symlink + mode = 'l' + else: + raise RuntimeError("mode %r unsupported" % filecmd.mode) + + self.mode[filecmd.path] = mode def delete_handler(self, filecmd): - self.files.add(filecmd.path) - self.repo.remove([filecmd.path], unlink=True) + self.modified.append((filecmd.path, None)) - #def copy_handler(self, filecmd): - # self.files.add(filecmd.path) - # """Handle a filecopy command.""" - # self.ui.write("Cmd: %s\n" % repr(filecmd)) + def copy_handler(self, filecmd): + self.copies[filecmd.src_path] = filecmd.dest_path - #def rename_handler(self, filecmd): - # self.files.add(filecmd.path) - # """Handle a filerename command.""" - # self.ui.write("Cmd: %s\n" % repr(filecmd)) - - def filelist(self): - return list(self.files) + def rename_handler(self, filecmd): + # copy oldname to newname and delete oldname + self.copies[filecmd.oldname] = filecmd.newname + self.files.append((filecmd.path, None))
