comparison hgfastimport/hgimport.py @ 47:7ff36dc9f0b1

Massive rework to use infrastructure provided by convert extension. fastimport no longer stages changes in the repository's working copy; instead, it now works like any other convert source: the imported history is kept in memory (except for file contents) and then processed by the 'convert' extension.
author Greg Ward <greg-hg@gerg.ca>
date Sat, 16 May 2009 12:57:22 -0400
parents 93c2b1e832bd
children 1cf21a8c274b
comparison
equal deleted inserted replaced
46:93c2b1e832bd 47:7ff36dc9f0b1
19 This module provides core processing functionality including an abstract class 19 This module provides core processing functionality including an abstract class
20 for basing real processors on. See the processors package for examples. 20 for basing real processors on. See the processors package for examples.
21 """ 21 """
22 22
23 import os 23 import os
24 import os.path
25 import errno
26 import shutil 24 import shutil
27 25
28 import mercurial.hg 26 from hgext.convert import common
29 import mercurial.commands 27
30 from mercurial import util 28 from fastimport import processor, parser
31 from mercurial.node import nullrev, hex 29
32 30 class fastimport_source(common.converter_source):
33 from fastimport import processor 31 """Interface between the fastimport processor below and Mercurial's
34 from hgfastimport import hgechoprocessor 32 normal conversion infrastructure.
33 """
34 def __init__(self, ui, repo, sources):
35 self.ui = ui
36 self.sources = sources
37 self.processor = HgImportProcessor(ui, repo)
38 self.parsed = False
39
40 # converter_source methods
41
42 def before(self):
43 self.processor.setup()
44
45 def after(self):
46 self.processor.teardown()
47
48 def getheads(self):
49 """Return a list of this repository's heads"""
50 self._parse()
51 allheads = []
52 for branchheads in self.processor.branchmap.values():
53 allheads.extend(branchheads)
54 return allheads
55
56 def getfile(self, name, fileid):
57 """Return file contents as a string. fileid is the identifier returned
58 by a previous call to getchanges().
59 """
60 if fileid is None: # deleted file
61 raise IOError
62 return self.processor.getblob(fileid)
63
64 def getmode(self, name, fileid):
65 """Return file mode, e.g. '', 'x', or 'l'. fileid is the identifier
66 returned by a previous call to getchanges().
67 """
68 return self.processor.getmode(name, fileid)
69
70 def getchanges(self, commitid):
71 """Returns a tuple of (files, copies).
72
73 files is a sorted list of (filename, id) tuples for all files
74 changed between commitid and its first parent returned by
75 getcommit(). id is the source revision id of the file.
76
77 copies is a dictionary of dest: source
78 """
79 return (self.processor.modified[commitid],
80 self.processor.copies[commitid])
81
82 def getcommit(self, commitid):
83 """Return the commit object for commitid"""
84 if commitid is None:
85 return None
86 else:
87 return self.processor.commitmap[commitid]
88
89 def gettags(self):
90 """Return the tags as a dictionary of name: revision"""
91 return dict(self.processor.tags)
92
93 def getchangedfiles(self, rev, i):
94 """Return the files changed by rev compared to parent[i].
95
96 i is an index selecting one of the parents of rev. The return
97 value should be the list of files that are different in rev and
98 this parent.
99
100 If rev has no parents, i is None.
101
102 This function is only needed to support --filemap
103 """
104 raise NotImplementedError()
105
106 # private worker methods
107
108 def _parse(self):
109 if self.parsed:
110 return
111
112 for source in self.sources:
113 self.ui.debug("reading fastimport source: %s\n" % source)
114 f = open(source)
115 p = parser.ImportParser(f)
116 self.processor.process(p.iter_commands)
117 f.close()
118
119 self.parsed = True
35 120
36 class HgImportProcessor(processor.ImportProcessor): 121 class HgImportProcessor(processor.ImportProcessor):
37 122
38 def __init__(self, ui, repo, **opts): 123 def __init__(self, ui, repo):
39 super(HgImportProcessor, self).__init__() 124 super(HgImportProcessor, self).__init__()
40 self.ui = ui 125 self.ui = ui
41 self.repo = repo 126 self.repo = repo
42 self.opts = opts 127
43 self.last_commit = None # CommitCommand object 128 self.commitmap = {} # map commit ID (":1") to commit object
44 self.mark_map = {} # map mark (e.g. ":1") to revision number 129 self.branchmap = {} # map branch name to list of heads
45 self.branch_map = {} # map git branch name to revision number 130
46 self.lightweight_tags = [] # list of (ref, mark) tuples 131 # see HgImportCommitHandler for details on these three
132 self.modified = {} # map commit id to list of file mods
133 self.filemodes = {} # map commit id to {filename: mode} map
134 self.copies = {} # map commit id to dict of file copies
135
136 self.tags = [] # list of (tag, mark) tuples
47 137
48 self.numblobs = 0 # for progress reporting 138 self.numblobs = 0 # for progress reporting
49 self.blobdir = None 139 self.blobdir = None
50 140
51 def setup(self): 141 def setup(self):
52 """Setup before processing any streams.""" 142 """Setup before processing any streams."""
53 pass 143 pass
54 144
55 def teardown(self): 145 def teardown(self):
56 """Cleanup after processing all streams.""" 146 """Cleanup after processing all streams."""
57 # Hmmm: this isn't really a cleanup step, it's a post-processing
58 # step. But we currently have one processor per input
59 # stream... despite the fact that state like mark_map,
60 # branch_map, and lightweight_tags really should span input
61 # streams.
62 self.write_lightweight_tags()
63
64 if self.blobdir and os.path.exists(self.blobdir): 147 if self.blobdir and os.path.exists(self.blobdir):
65 self.ui.status("Removing blob dir %r ...\n" % self.blobdir) 148 self.ui.status("Removing blob dir %r ...\n" % self.blobdir)
66 shutil.rmtree(self.blobdir) 149 shutil.rmtree(self.blobdir)
67 150
68 def progress_handler(self, cmd): 151 def progress_handler(self, cmd):
69 self.ui.write("Progress: %s\n" % cmd.message) 152 self.ui.write("Progress: %s\n" % cmd.message)
70 153
71 def blob_handler(self, cmd): 154 def blob_handler(self, cmd):
72 if self.blobdir is None: # no blobs seen yet 155 self.writeblob(cmd.id, cmd.data)
73 # XXX cleanup? 156
74 self.blobdir = os.path.join(self.repo.root, ".hg", "blobs") 157 def _getblobfilename(self, blobid):
75 os.mkdir(self.blobdir)
76
77 fn = self.getblobfilename(cmd.id)
78 blobfile = open(fn, "wb")
79 #self.ui.debug("writing blob %s to %s (%d bytes)\n"
80 # % (cmd.id, fn, len(cmd.data)))
81 blobfile.write(cmd.data)
82 blobfile.close()
83
84 self.numblobs += 1
85 if self.numblobs % 500 == 0:
86 self.ui.status("%d blobs read\n" % self.numblobs)
87
88 def getblobfilename(self, blobid):
89 if self.blobdir is None: 158 if self.blobdir is None:
90 raise RuntimeError("no blobs seen, so no blob directory created") 159 raise RuntimeError("no blobs seen, so no blob directory created")
91 # XXX should escape ":" for windows 160 # XXX should escape ":" for windows
92 return os.path.join(self.blobdir, "blob-" + blobid) 161 return os.path.join(self.blobdir, "blob-" + blobid)
93 162
163 def getblob(self, fileid):
164 (commitid, blobid) = fileid
165 f = open(self._getblobfilename(blobid), "rb")
166 try:
167 return f.read()
168 finally:
169 f.close()
170
171 def writeblob(self, blobid, data):
172 if self.blobdir is None: # no blobs seen yet
173 self.blobdir = os.path.join(self.repo.root, ".hg", "blobs")
174 os.mkdir(self.blobdir)
175
176 fn = self._getblobfilename(blobid)
177 blobfile = open(fn, "wb")
178 #self.ui.debug("writing blob %s to %s (%d bytes)\n"
179 # % (blobid, fn, len(data)))
180 blobfile.write(data)
181 blobfile.close()
182
183 self.numblobs += 1
184 if self.numblobs % 500 == 0:
185 self.ui.status("%d blobs read\n" % self.numblobs)
186
187 def getmode(self, name, fileid):
188 (commitid, blobid) = fileid
189 return self.filemodes[commitid][name]
190
94 def checkpoint_handler(self, cmd): 191 def checkpoint_handler(self, cmd):
95 # This command means nothing to us 192 # This command means nothing to us
96 pass 193 pass
97 194
98 def committish_rev(self, committish): 195 def _getcommit(self, committish):
196 """Given a mark reference or a branch name, return the
197 appropriate commit object. Return None if committish is a
198 branch with no commits. Raises KeyError if anything else is out
199 of whack.
200 """
99 if committish.startswith(":"): 201 if committish.startswith(":"):
100 return self.mark_map[committish] 202 # KeyError here indicates the input stream is broken.
101 else: 203 return self.commitmap[committish]
102 return self.branch_map[committish] 204 else:
103 205 branch = self._getbranch(committish)
206 if branch is None:
207 raise ValueError("invalid committish: %r" % committish)
208
209 heads = self.branchmap.get(branch)
210 if heads is None:
211 return None
212 else:
213 # KeyError here indicates bad commit id in self.branchmap.
214 return self.commitmap[heads[-1]]
215
216 def _getbranch(self, ref):
217 """Translate a Git head ref to corresponding Mercurial branch
218 name. E.g. \"refs/heads/foo\" is translated to \"foo\".
219 Special case: \"refs/heads/master\" becomes \"default\". If
220 'ref' is not a head ref, return None.
221 """
222 prefix = "refs/heads/"
223 if ref.startswith(prefix):
224 branch = ref[len(prefix):]
225 if branch == "master":
226 return "default"
227 else:
228 return branch
229 else:
230 return None
231
104 def commit_handler(self, cmd): 232 def commit_handler(self, cmd):
105 # XXX this assumes the fixup branch name used by cvs2git. In 233 # XXX this assumes the fixup branch name used by cvs2git. In
106 # contrast, git-fast-import(1) recommends "TAG_FIXUP" (not under 234 # contrast, git-fast-import(1) recommends "TAG_FIXUP" (not under
107 # refs/heads), and implies that it can be called whatever the 235 # refs/heads), and implies that it can be called whatever the
108 # creator of the fastimport dump wants to call it. So the name 236 # creator of the fastimport dump wants to call it. So the name
109 # of the fixup branch should be configurable! 237 # of the fixup branch should be configurable!
110 fixup = (cmd.ref == "refs/heads/TAG.FIXUP") 238 fixup = (cmd.ref == "refs/heads/TAG.FIXUP")
111 239
112 if cmd.from_: 240 if cmd.from_:
113 first_parent = self.committish_rev(cmd.from_) 241 first_parent = cmd.from_
114 else: 242 else:
115 first_parent = self.branch_map.get(cmd.ref, nullrev) 243 first_parent = self._getcommit(cmd.ref) # commit object
244 if first_parent is not None:
245 first_parent = first_parent.rev # commit id
246
116 if cmd.merges: 247 if cmd.merges:
117 if len(cmd.merges) > 1: 248 if len(cmd.merges) > 1:
118 raise NotImplementedError("Can't handle more than two parents") 249 raise NotImplementedError("Can't handle more than two parents")
119 second_parent = self.committish_rev(cmd.merges[0]) 250 second_parent = cmd.merges[0]
120 else: 251 else:
121 second_parent = nullrev 252 second_parent = None
122 253
123 if first_parent is nullrev and second_parent is not nullrev: 254 if first_parent is None and second_parent is not None:
124 # First commit on a new branch that has 'merge' but no 'from': 255 # First commit on a new branch that has 'merge' but no 'from':
125 # special case meaning branch starts with no files; the contents of 256 # special case meaning branch starts with no files; the contents of
126 # the first commit (this one) determine the list of files at branch 257 # the first commit (this one) determine the list of files at branch
127 # time. 258 # time.
128 first_parent = second_parent 259 first_parent = second_parent
129 second_parent = nullrev 260 second_parent = None
130 no_files = True # XXX not handled 261 no_files = True # XXX this is ignored...
131 262
132 self.ui.debug("commit %s: first_parent = %r, second_parent = %r\n" 263 self.ui.debug("commit %s: first_parent = %r, second_parent = %r\n"
133 % (cmd.id, first_parent, second_parent)) 264 % (cmd, first_parent, second_parent))
134 assert ((first_parent != second_parent) or 265 assert ((first_parent != second_parent) or
135 (first_parent == second_parent == -1)), \ 266 (first_parent is second_parent is None)), \
136 ("commit %s: first_parent == second parent = %r" 267 ("commit %s: first_parent == second parent = %r"
137 % (cmd.id, first_parent)) 268 % (cmd, first_parent))
138 269
139 # Update to the first parent 270 # Figure out the Mercurial branch name.
140 mercurial.hg.clean(self.repo, self.repo.lookup(first_parent)) 271 if fixup and first_parent is not None:
141 mercurial.commands.debugsetparents( 272 # If this is a fixup commit, pretend it happened on the same
142 self.ui, self.repo, first_parent, second_parent) 273 # branch as its first parent. (We don't want a Mercurial
143 274 # named branch called "TAG.FIXUP" in the output repository.)
144 if cmd.ref == "refs/heads/master": 275 branch = self.commitmap[first_parent].branch
145 branch = "default" 276 else:
146 elif fixup and first_parent is not nullrev: 277 branch = self._getbranch(cmd.ref)
147 # If this is a fixup commit, pretend it happened on the same branch 278
148 # as its first parent. (We don't want a Mercurial named branch
149 # called "TAG.FIXUP" in the output repository.)
150 branch = self.repo.changectx(first_parent).branch()
151 else:
152 branch = cmd.ref[len("refs/heads/"):]
153 self.repo.dirstate.setbranch(branch)
154 commit_handler = HgImportCommitHandler( 279 commit_handler = HgImportCommitHandler(
155 self, cmd, self.ui, self.repo, **self.opts) 280 self, cmd, self.ui)
156 commit_handler.process() 281 commit_handler.process()
282 self.modified[cmd.id] = commit_handler.modified
283 self.filemodes[cmd.id] = commit_handler.mode
284 self.copies[cmd.id] = commit_handler.copies
157 285
158 # in case we are converting from git or bzr, prefer author but 286 # in case we are converting from git or bzr, prefer author but
159 # fallback to committer (committer is required, author is 287 # fallback to committer (committer is required, author is
160 # optional) 288 # optional)
161 userinfo = cmd.author or cmd.committer 289 userinfo = cmd.author or cmd.committer
162 user = "%s <%s>" % (userinfo[0], userinfo[1]) 290 if userinfo[0] == userinfo[1]:
163 291 # In order to conform to fastimport syntax, cvs2git with no
164 # Blech: have to monkeypatch mercurial.encoding to ensure that 292 # authormap produces author names like "jsmith <jsmith>"; if
165 # everything under rawcommit() assumes the same encoding, 293 # we see that, revert to plain old "jsmith".
166 # regardless of current locale. 294 user = userinfo[0]
167 from mercurial import encoding 295 else:
168 encoding.encoding = "UTF-8" 296 user = "%s <%s>" % (userinfo[0], userinfo[1])
169 297
170 files = commit_handler.filelist()
171 assert type(cmd.message) is unicode 298 assert type(cmd.message) is unicode
172 text = cmd.message.encode("utf-8") # XXX cmd.message is unicode 299 text = cmd.message.encode("utf-8")
173 date = self.convert_date(userinfo) 300 date = self.convert_date(userinfo)
174 node = self.repo.rawcommit( 301
175 files=files, text=text, user=user, date=date) 302 parents = filter(None, [first_parent, second_parent])
176 rev = self.repo.changelog.rev(node) 303 commit = common.commit(user, date, text, parents, branch, rev=cmd.id)
177 if cmd.mark is not None: 304
178 self.mark_map[":" + cmd.mark] = rev 305 self.commitmap[cmd.id] = commit
179 if not fixup: 306 heads = self.branchmap.get(branch)
180 self.branch_map[cmd.ref] = rev 307 if heads is None:
181 self.last_commit = cmd 308 heads = [cmd.id]
182 self.ui.write("Done commit of rev %d\n" % rev) 309 else:
310 # adding to an existing branch: replace the previous head
311 try:
312 heads.remove(first_parent)
313 except ValueError: # first parent not a head: no problem
314 pass
315 heads.append(cmd.id) # at end means this is tipmost
316 self.branchmap[branch] = heads
317 self.ui.debug("processed commit %s\n" % cmd)
183 318
184 def convert_date(self, c): 319 def convert_date(self, c):
185 res = (int(c[2]), int(c[3])) 320 res = (int(c[2]), int(c[3]))
186 #print c, res 321 #print c, res
187 #print type((0, 0)), type(res), len(res), type(res) is type((0, 0)) 322 #print type((0, 0)), type(res), len(res), type(res) is type((0, 0))
189 # print "go for it" 324 # print "go for it"
190 #return res 325 #return res
191 return "%d %d" % res 326 return "%d %d" % res
192 327
193 def reset_handler(self, cmd): 328 def reset_handler(self, cmd):
194 if cmd.ref.startswith("refs/heads/"): 329 tagprefix = "refs/tags/"
330 branch = self._getbranch(cmd.ref)
331 if branch:
195 # The usual case for 'reset': (re)create the named branch. 332 # The usual case for 'reset': (re)create the named branch.
196 # XXX what should we do if cmd.from_ is None? 333 # XXX what should we do if cmd.from_ is None?
197 if cmd.from_ is not None: 334 if cmd.from_ is not None:
198 self.branch_map[cmd.ref] = self.committish_rev(cmd.from_) 335 self.branchmap[branch] = [cmd.from_]
199 else: 336 else:
200 # pretend the branch never existed... is this right?!? 337 # pretend the branch never existed... is this right?!?
201 try: 338 try:
202 del self.branch_map[cmd.ref] 339 del self.branchmap[branch]
203 except KeyError: 340 except KeyError:
204 pass 341 pass
205 #else: 342 #else:
206 # # XXX filename? line number? 343 # # XXX filename? line number?
207 # self.ui.warn("ignoring branch reset with no 'from'\n") 344 # self.ui.warn("ignoring branch reset with no 'from'\n")
208 elif cmd.ref.startswith("refs/tags/"): 345 elif cmd.ref.startswith(tagprefix):
209 # Create a "lightweight tag" in Git terms. As I understand 346 # Create a "lightweight tag" in Git terms. As I understand
210 # it, that's a tag with no description and no history -- 347 # it, that's a tag with no description and no history --
211 # rather like CVS tags. cvs2git turns CVS tags into Git 348 # rather like CVS tags. cvs2git turns CVS tags into Git
212 # lightweight tags, so we should make sure they become 349 # lightweight tags, so we should make sure they become
213 # Mercurial tags. But we don't have to fake a history for 350 # Mercurial tags. But we don't have to fake a history for
214 # them; save them up for the end. 351 # them; save them up for the end.
215 self.lightweight_tags.append((cmd.ref, cmd.from_)) 352 tag = cmd.ref[len(tagprefix):]
353 self.tags.append((tag, cmd.from_))
216 354
217 def tag_handler(self, cmd): 355 def tag_handler(self, cmd):
218 pass 356 pass
219 357
220 def write_lightweight_tags(self):
221 if not self.lightweight_tags: # avoid writing empty .hgtags
222 return
223
224 # XXX what about duplicate tags? lightweight_tags is
225 # deliberately a list, to preserve order ... but do we need to
226 # worry about repeated tags? (Certainly not for cvs2git output,
227 # since CVS has no tag history.)
228
229 # Create Mercurial tags from git-style "lightweight tags" in the
230 # input stream.
231 self.ui.status("updating tags\n")
232 mercurial.hg.clean(self.repo, self.repo.lookup("default"))
233 tagfile = open(self.repo.wjoin(".hgtags"), "ab")
234 for (ref, mark) in self.lightweight_tags:
235 tag = ref[len("refs/tags/"):]
236 rev = self.mark_map[mark]
237 node = self.repo.changelog.node(rev)
238 tagfile.write("%s %s\n" % (hex(node), tag))
239 tagfile.close()
240
241 files = [".hgtags"]
242 self.repo.rawcommit(
243 files=files, text="update tags", user="convert-repo", date=None)
244 358
245 class HgImportCommitHandler(processor.CommitHandler): 359 class HgImportCommitHandler(processor.CommitHandler):
246 360
247 def __init__(self, parent, command, ui, repo, **opts): 361 def __init__(self, parent, command, ui):
248 self.parent = parent # HgImportProcessor running the show 362 self.parent = parent # HgImportProcessor running the show
249 self.command = command 363 self.command = command # CommitCommand that we're processing
250 self.ui = ui 364 self.ui = ui
251 self.repo = repo 365
252 self.opts = opts 366 # Files changed by this commit as a list of (filename, id)
253 self.files = set() 367 # tuples where id is (commitid, blobid). The blobid is
254 368 # needed to fetch the file's contents later, and the commitid
255 def _make_container(self, path): 369 # is needed to fetch the mode.
256 if '/' in path: 370 # (XXX what about inline file contents?)
257 d = os.path.dirname(path) 371 # (XXX how to describe deleted files?)
258 if not os.path.isdir(d): 372 self.modified = []
259 os.makedirs(d) 373
374 # mode of files listed in self.modified: '', 'x', or 'l'
375 self.mode = {}
376
377 # dictionary of src: dest (renamed files are in here and self.modified)
378 self.copies = {}
379
380 # number of inline files seen in this commit
381 self.inlinecount = 0
260 382
261 def modify_handler(self, filecmd): 383 def modify_handler(self, filecmd):
262 #print "============================" + filecmd.path
263 # FIXME: handle mode
264 self.files.add(filecmd.path)
265 fullpath = os.path.join(self.repo.root, filecmd.path)
266 self._make_container(fullpath)
267 #print "made dirs, writing file"
268 if filecmd.dataref: 384 if filecmd.dataref:
269 # reference to a blob that has already appeared in the stream 385 blobid = filecmd.dataref # blobid is the mark of the blob
270 fn = self.parent.getblobfilename(filecmd.dataref) 386 else:
271 if os.path.exists(fullpath): 387 blobid = "%s-inline:%d" % (self.command.id, self.inlinecount)
272 os.remove(fullpath) 388 assert filecmd.data is not None
273 try: 389 self.parent.writeblob(blobid, filecmd.data)
274 os.link(fn, fullpath) 390 self.inlinecount += 1
275 except OSError, err: 391
276 if err.errno == errno.ENOENT: 392 fileid = (self.command.id, blobid)
277 # if this happens, it's a problem in the fast-import 393
278 # stream 394 self.modified.append((filecmd.path, fileid))
279 raise util.Abort("bad blob ref %r (no such file %s)" 395 if filecmd.mode.endswith("644"): # normal file
280 % (filecmd.dataref, fn)) 396 mode = ''
281 else: 397 elif filecmd.mode.endswith("755"): # executable
282 # anything else is a bug in this extension 398 mode = 'x'
283 # (cross-device move, permissions, etc.) 399 elif filecmd.mode == "120000": # symlink
284 raise 400 mode = 'l'
285 elif filecmd.data: 401 else:
286 f = open(fullpath, "w") 402 raise RuntimeError("mode %r unsupported" % filecmd.mode)
287 f.write(filecmd.data) 403
288 f.close() 404 self.mode[filecmd.path] = mode
289 else:
290 raise RuntimeError("either filecmd.dataref or filecmd.data must be set")
291 #print self.repo.add([filecmd.path])
292 #print "Done:", filecmd.path
293 405
294 def delete_handler(self, filecmd): 406 def delete_handler(self, filecmd):
295 self.files.add(filecmd.path) 407 self.modified.append((filecmd.path, None))
296 self.repo.remove([filecmd.path], unlink=True) 408
297 409 def copy_handler(self, filecmd):
298 #def copy_handler(self, filecmd): 410 self.copies[filecmd.src_path] = filecmd.dest_path
299 # self.files.add(filecmd.path) 411
300 # """Handle a filecopy command.""" 412 def rename_handler(self, filecmd):
301 # self.ui.write("Cmd: %s\n" % repr(filecmd)) 413 # copy oldname to newname and delete oldname
302 414 self.copies[filecmd.oldname] = filecmd.newname
303 #def rename_handler(self, filecmd): 415 self.files.append((filecmd.path, None))
304 # self.files.add(filecmd.path)
305 # """Handle a filerename command."""
306 # self.ui.write("Cmd: %s\n" % repr(filecmd))
307
308 def filelist(self):
309 return list(self.files)