|
86
|
1 # Copyright (C) 2008-2010 Canonical Ltd |
|
|
2 # |
|
|
3 # This program is free software; you can redistribute it and/or modify |
|
|
4 # it under the terms of the GNU General Public License as published by |
|
|
5 # the Free Software Foundation; either version 2 of the License, or |
|
|
6 # (at your option) any later version. |
|
|
7 # |
|
|
8 # This program is distributed in the hope that it will be useful, |
|
|
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
|
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
|
11 # GNU General Public License for more details. |
|
|
12 # |
|
|
13 # You should have received a copy of the GNU General Public License |
|
|
14 # along with this program. If not, see <http://www.gnu.org/licenses/>. |
|
|
15 |
|
|
16 """Parser of import data into command objects. |
|
|
17 |
|
|
18 In order to reuse existing front-ends, the stream format is a subset of |
|
|
19 the one used by git-fast-import (as of the 1.5.4 release of git at least). |
|
|
20 The grammar is: |
|
|
21 |
|
|
22 stream ::= cmd*; |
|
|
23 |
|
|
24 cmd ::= new_blob |
|
|
25 | new_commit |
|
|
26 | new_tag |
|
|
27 | reset_branch |
|
|
28 | checkpoint |
|
|
29 | progress |
|
|
30 ; |
|
|
31 |
|
|
32 new_blob ::= 'blob' lf |
|
|
33 mark? |
|
|
34 file_content; |
|
|
35 file_content ::= data; |
|
|
36 |
|
|
37 new_commit ::= 'commit' sp ref_str lf |
|
|
38 mark? |
|
|
39 ('author' sp name '<' email '>' when lf)? |
|
|
40 'committer' sp name '<' email '>' when lf |
|
|
41 commit_msg |
|
|
42 ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)? |
|
|
43 ('merge' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)* |
|
|
44 file_change* |
|
|
45 lf?; |
|
|
46 commit_msg ::= data; |
|
|
47 |
|
|
48 file_change ::= file_clr |
|
|
49 | file_del |
|
|
50 | file_rnm |
|
|
51 | file_cpy |
|
|
52 | file_obm |
|
|
53 | file_inm; |
|
|
54 file_clr ::= 'deleteall' lf; |
|
|
55 file_del ::= 'D' sp path_str lf; |
|
|
56 file_rnm ::= 'R' sp path_str sp path_str lf; |
|
|
57 file_cpy ::= 'C' sp path_str sp path_str lf; |
|
|
58 file_obm ::= 'M' sp mode sp (hexsha1 | idnum) sp path_str lf; |
|
|
59 file_inm ::= 'M' sp mode sp 'inline' sp path_str lf |
|
|
60 data; |
|
|
61 |
|
|
62 new_tag ::= 'tag' sp tag_str lf |
|
|
63 'from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf |
|
|
64 'tagger' sp name '<' email '>' when lf |
|
|
65 tag_msg; |
|
|
66 tag_msg ::= data; |
|
|
67 |
|
|
68 reset_branch ::= 'reset' sp ref_str lf |
|
|
69 ('from' sp (ref_str | hexsha1 | sha1exp_str | idnum) lf)? |
|
|
70 lf?; |
|
|
71 |
|
|
72 checkpoint ::= 'checkpoint' lf |
|
|
73 lf?; |
|
|
74 |
|
|
75 progress ::= 'progress' sp not_lf* lf |
|
|
76 lf?; |
|
|
77 |
|
|
78 # note: the first idnum in a stream should be 1 and subsequent |
|
|
79 # idnums should not have gaps between values as this will cause |
|
|
80 # the stream parser to reserve space for the gapped values. An |
|
|
81 # idnum can be updated in the future to a new object by issuing |
|
|
82 # a new mark directive with the old idnum. |
|
|
83 # |
|
|
84 mark ::= 'mark' sp idnum lf; |
|
|
85 data ::= (delimited_data | exact_data) |
|
|
86 lf?; |
|
|
87 |
|
|
88 # note: delim may be any string but must not contain lf. |
|
|
89 # data_line may contain any data but must not be exactly |
|
|
90 # delim. The lf after the final data_line is included in |
|
|
91 # the data. |
|
|
92 delimited_data ::= 'data' sp '<<' delim lf |
|
|
93 (data_line lf)* |
|
|
94 delim lf; |
|
|
95 |
|
|
96 # note: declen indicates the length of binary_data in bytes. |
|
|
97 # declen does not include the lf preceding the binary data. |
|
|
98 # |
|
|
99 exact_data ::= 'data' sp declen lf |
|
|
100 binary_data; |
|
|
101 |
|
|
102 # note: quoted strings are C-style quoting supporting \c for |
|
|
103 # common escapes of 'c' (e.g. \n, \t, \\, \") or \nnn where nnn |
|
|
104 # is the signed byte value in octal. Note that the only |
|
|
105 # characters which must actually be escaped to protect the |
|
|
106 # stream formatting is: \, " and LF. Otherwise these values |
|
|
107 # are UTF8. |
|
|
108 # |
|
|
109 ref_str ::= ref; |
|
|
110 sha1exp_str ::= sha1exp; |
|
|
111 tag_str ::= tag; |
|
|
112 path_str ::= path | '"' quoted(path) '"' ; |
|
|
113 mode ::= '100644' | '644' |
|
|
114 | '100755' | '755' |
|
|
115 | '120000' |
|
|
116 ; |
|
|
117 |
|
|
118 declen ::= # unsigned 32 bit value, ascii base10 notation; |
|
|
119 bigint ::= # unsigned integer value, ascii base10 notation; |
|
|
120 binary_data ::= # file content, not interpreted; |
|
|
121 |
|
|
122 when ::= raw_when | rfc2822_when; |
|
|
123 raw_when ::= ts sp tz; |
|
|
124 rfc2822_when ::= # Valid RFC 2822 date and time; |
|
|
125 |
|
|
126 sp ::= # ASCII space character; |
|
|
127 lf ::= # ASCII newline (LF) character; |
|
|
128 |
|
|
129 # note: a colon (':') must precede the numerical value assigned to |
|
|
130 # an idnum. This is to distinguish it from a ref or tag name as |
|
|
131 # GIT does not permit ':' in ref or tag strings. |
|
|
132 # |
|
|
133 idnum ::= ':' bigint; |
|
|
134 path ::= # GIT style file path, e.g. "a/b/c"; |
|
|
135 ref ::= # GIT ref name, e.g. "refs/heads/MOZ_GECKO_EXPERIMENT"; |
|
|
136 tag ::= # GIT tag name, e.g. "FIREFOX_1_5"; |
|
|
137 sha1exp ::= # Any valid GIT SHA1 expression; |
|
|
138 hexsha1 ::= # SHA1 in hexadecimal format; |
|
|
139 |
|
|
140 # note: name and email are UTF8 strings, however name must not |
|
|
141 # contain '<' or lf and email must not contain any of the |
|
|
142 # following: '<', '>', lf. |
|
|
143 # |
|
|
144 name ::= # valid GIT author/committer name; |
|
|
145 email ::= # valid GIT author/committer email; |
|
|
146 ts ::= # time since the epoch in seconds, ascii base10 notation; |
|
|
147 tz ::= # GIT style timezone; |
|
|
148 |
|
|
149 # note: comments may appear anywhere in the input, except |
|
|
150 # within a data command. Any form of the data command |
|
|
151 # always escapes the related input from comment processing. |
|
|
152 # |
|
|
153 # In case it is not clear, the '#' that starts the comment |
|
|
154 # must be the first character on the line (an lf must have |
|
|
155 # preceded it). |
|
|
156 # |
|
|
157 comment ::= '#' not_lf* lf; |
|
|
158 not_lf ::= # Any byte that is not ASCII newline (LF); |
|
|
159 """ |
|
|
160 from __future__ import print_function |
|
|
161 |
|
|
162 import collections |
|
|
163 import re |
|
|
164 import sys |
|
|
165 import codecs |
|
|
166 |
|
|
167 from fastimport import ( |
|
|
168 commands, |
|
|
169 dates, |
|
|
170 errors, |
|
|
171 ) |
|
|
172 from fastimport.helpers import ( |
|
|
173 newobject as object, |
|
|
174 utf8_bytes_string, |
|
|
175 ) |
|
|
176 |
|
|
177 |
|
|
178 ## Stream parsing ## |
|
|
179 |
|
|
class LineBasedParser(object):
    """A line-oriented reader that tracks line numbers for error reporting."""

    def __init__(self, input_stream):
        """A Parser that keeps track of line numbers.

        :param input_stream: the file-like object to read from
        """
        self.input = input_stream
        # Number of the most recently read line (1-based once reading starts)
        self.lineno = 0
        # Lines pushed back onto the input stream
        self._buffer = []

    def abort(self, exception, *args):
        """Raise an exception providing line number information."""
        raise exception(self.lineno, *args)

    def readline(self):
        """Get the next line including the newline or '' on EOF."""
        self.lineno += 1
        if self._buffer:
            # Serve pushed-back lines before touching the real stream
            return self._buffer.pop()
        else:
            return self.input.readline()

    def next_line(self):
        """Get the next line without the newline or None on EOF."""
        line = self.readline()
        if line:
            return line[:-1]
        else:
            return None

    def push_line(self, line):
        """Push line back onto the line buffer.

        :param line: the line with no trailing newline
        """
        self.lineno -= 1
        self._buffer.append(line + b'\n')

    def read_bytes(self, count):
        """Read a given number of bytes from the input stream.

        Throws MissingBytes if the bytes are not found.

        Note: This method does not read from the line buffer.

        :return: a string
        """
        result = self.input.read(count)
        found = len(result)
        # Keep the line counter accurate across raw binary reads
        self.lineno += result.count(b'\n')
        if found != count:
            self.abort(errors.MissingBytes, count, found)
        return result

    def read_until(self, terminator):
        """Read the input stream until the terminator is found.

        Throws MissingTerminator if the terminator is not found.

        Note: This method does not read from the line buffer.

        :return: the bytes read up to but excluding the terminator.
        """
        lines = []
        term = terminator + b'\n'
        while True:
            line = self.input.readline()
            if not line:
                # EOF before the terminator: raise the documented error
                # instead of looping forever on empty reads.
                self.abort(errors.MissingTerminator, terminator)
            self.lineno += 1
            if line == term:
                break
            lines.append(line)
        return b''.join(lines)
|
|
255 |
|
|
256 |
|
|
# Regular expressions used for parsing. (Note: The spec states that the name
# part should be non-empty but git-fast-export doesn't always do that so
# the first bit is \w*, not \w+.)  Also git-fast-import code says the
# space before the email is optional.
# Groups: 1 = name (may be empty), 2 = email, 3 = the when/date portion.
_WHO_AND_WHEN_RE = re.compile(br'([^<]*)<(.*)> (.+)')
# Same as above but without the trailing date, for 'accept_just_who' cases.
_WHO_RE = re.compile(br'([^<]*)<(.*)>')
|
|
263 |
|
|
264 |
|
|
class ImportParser(LineBasedParser):
    """Parser turning a git-fast-import stream into ImportCommand objects."""

    def __init__(self, input_stream, verbose=False, output=sys.stdout,
            user_mapper=None, strict=True):
        """A Parser of import commands.

        :param input_stream: the file-like object to read from
        :param verbose: display extra information or not
        :param output: the file-like object to write messages to (YAGNI?)
        :param user_mapper: if not None, the UserMapper used to adjust
          user-ids for authors, committers and taggers.
        :param strict: Raise errors on strictly invalid data
        """
        LineBasedParser.__init__(self, input_stream)
        self.verbose = verbose
        self.output = output
        self.user_mapper = user_mapper
        self.strict = strict
        # We auto-detect the date format when a date is first encountered
        self.date_parser = None
        # feature name -> value, filled in as 'feature' commands are parsed
        self.features = {}

    def warning(self, msg):
        """Write a warning, tagged with the current line number, to stderr."""
        sys.stderr.write("warning line %d: %s\n" % (self.lineno, msg))

    def iter_commands(self):
        """Iterator returning ImportCommand objects."""
        while True:
            line = self.next_line()
            if line is None:
                if b'done' in self.features:
                    # The stream declared the 'done' feature but ended
                    # without an explicit 'done' command.
                    raise errors.PrematureEndOfStream(self.lineno)
                break
            elif len(line) == 0 or line.startswith(b'#'):
                # Blank lines and comments may appear between commands
                continue
            # Search for commands in order of likelihood
            elif line.startswith(b'commit '):
                yield self._parse_commit(line[len(b'commit '):])
            elif line.startswith(b'blob'):
                yield self._parse_blob()
            elif line.startswith(b'done'):
                break
            elif line.startswith(b'progress '):
                yield commands.ProgressCommand(line[len(b'progress '):])
            elif line.startswith(b'reset '):
                yield self._parse_reset(line[len(b'reset '):])
            elif line.startswith(b'tag '):
                yield self._parse_tag(line[len(b'tag '):])
            elif line.startswith(b'checkpoint'):
                yield commands.CheckpointCommand()
            elif line.startswith(b'feature'):
                yield self._parse_feature(line[len(b'feature '):])
            else:
                self.abort(errors.InvalidCommand, line)

    def iter_file_commands(self):
        """Iterator returning FileCommand objects.

        If an invalid file command is found, the line is silently
        pushed back and iteration ends.
        """
        while True:
            line = self.next_line()
            if line is None:
                break
            elif len(line) == 0 or line.startswith(b'#'):
                continue
            # Search for file commands in order of likelihood
            elif line.startswith(b'M '):
                yield self._parse_file_modify(line[2:])
            elif line.startswith(b'D '):
                path = self._path(line[2:])
                yield commands.FileDeleteCommand(path)
            elif line.startswith(b'R '):
                old, new = self._path_pair(line[2:])
                yield commands.FileRenameCommand(old, new)
            elif line.startswith(b'C '):
                src, dest = self._path_pair(line[2:])
                yield commands.FileCopyCommand(src, dest)
            elif line.startswith(b'deleteall'):
                yield commands.FileDeleteAllCommand()
            else:
                # Not a file command: hand it back to the outer parser
                self.push_line(line)
                break

    def _parse_blob(self):
        """Parse a blob command."""
        lineno = self.lineno
        mark = self._get_mark_if_any()
        data = self._get_data(b'blob')
        return commands.BlobCommand(mark, data, lineno)

    def _parse_commit(self, ref):
        """Parse a commit command.

        :param ref: the ref name following 'commit ' on the command line
        """
        lineno = self.lineno
        mark = self._get_mark_if_any()
        author = self._get_user_info(b'commit', b'author', False)
        more_authors = []
        while True:
            another_author = self._get_user_info(b'commit', b'author', False)
            if another_author is not None:
                more_authors.append(another_author)
            else:
                break
        committer = self._get_user_info(b'commit', b'committer')
        message = self._get_data(b'commit', b'message')
        from_ = self._get_from()
        merges = []
        while True:
            merge = self._get_merge()
            if merge is not None:
                # while the spec suggests it's illegal, git-fast-export
                # outputs multiple merges on the one line, e.g.
                # merge :x :y :z
                these_merges = merge.split(b' ')
                merges.extend(these_merges)
            else:
                break
        properties = {}
        while True:
            name_value = self._get_property()
            if name_value is not None:
                name, value = name_value
                properties[name] = value
            else:
                break
        return commands.CommitCommand(ref, mark, author, committer, message,
            from_, merges, list(self.iter_file_commands()), lineno=lineno,
            more_authors=more_authors, properties=properties)

    def _parse_feature(self, info):
        """Parse a feature command.

        :param info: the text after 'feature ', i.e. 'name' or 'name=value'
        """
        parts = info.split(b'=', 1)
        name = parts[0]
        if len(parts) > 1:
            value = self._path(parts[1])
        else:
            value = None
        self.features[name] = value
        return commands.FeatureCommand(name, value, lineno=self.lineno)

    def _parse_file_modify(self, info):
        """Parse a filemodify command within a commit.

        :param info: a string in the format "mode dataref path"
          (where dataref might be the hard-coded literal 'inline').
        """
        params = info.split(b' ', 2)
        path = self._path(params[2])
        mode = self._mode(params[0])
        if params[1] == b'inline':
            dataref = None
            data = self._get_data(b'filemodify')
        else:
            dataref = params[1]
            data = None
        return commands.FileModifyCommand(path, mode, dataref,
            data)

    def _parse_reset(self, ref):
        """Parse a reset command."""
        from_ = self._get_from()
        return commands.ResetCommand(ref, from_)

    def _parse_tag(self, name):
        """Parse a tag command."""
        # 'from' is mandatory for tags, hence required_for=b'tag'
        from_ = self._get_from(b'tag')
        tagger = self._get_user_info(b'tag', b'tagger',
            accept_just_who=True)
        message = self._get_data(b'tag', b'message')
        return commands.TagCommand(name, from_, tagger, message)

    def _get_mark_if_any(self):
        """Parse a mark section.

        :return: the mark id (without the leading ':') or None
        """
        line = self.next_line()
        if line is not None and line.startswith(b'mark :'):
            return line[len(b'mark :'):]
        if line is not None:
            # Not a mark: leave the line for the next parsing step
            self.push_line(line)
        return None

    def _get_from(self, required_for=None):
        """Parse a from section.

        :param required_for: when set, a missing 'from' aborts with
          MissingSection naming this command
        """
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith(b'from '):
            return line[len(b'from '):]
        elif required_for:
            self.abort(errors.MissingSection, required_for, 'from')
        else:
            self.push_line(line)
            return None

    def _get_merge(self):
        """Parse a merge section, returning the merge value or None."""
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith(b'merge '):
            return line[len(b'merge '):]
        else:
            self.push_line(line)
            return None

    def _get_property(self):
        """Parse a property section, returning (name, value) or None."""
        line = self.next_line()
        if line is None:
            return None
        elif line.startswith(b'property '):
            return self._name_value(line[len(b'property '):])
        else:
            self.push_line(line)
            return None

    def _get_user_info(self, cmd, section, required=True,
        accept_just_who=False):
        """Parse a user section.

        :param cmd: the command name, for error reporting
        :param section: b'author', b'committer' or b'tagger'
        :param required: abort with MissingSection if the section is absent
        :param accept_just_who: tolerate a missing date (see _who_when)
        :return: an Authorship tuple or None
        """
        line = self.next_line()
        if line is not None and line.startswith(section + b' '):
            return self._who_when(line[len(section + b' '):], cmd, section,
                accept_just_who=accept_just_who)
        elif required:
            # Covers both a non-matching line and EOF (line is None)
            self.abort(errors.MissingSection, cmd, section)
        else:
            if line is not None:
                self.push_line(line)
            return None

    def _get_data(self, required_for, section=b'data'):
        """Parse a data section.

        :param required_for: the command name, for error reporting
        :return: the raw data bytes
        """
        line = self.next_line()
        if line is not None and line.startswith(b'data '):
            rest = line[len(b'data '):]
            if rest.startswith(b'<<'):
                # Delimited format: read up to the delimiter line
                return self.read_until(rest[2:])
            else:
                size = int(rest)
                read_bytes = self.read_bytes(size)
                # Skip the optional LF after the data; anything else is
                # pushed back for the next parsing step.
                next_line = self.input.readline()
                if next_line:
                    self.lineno += 1
                    if next_line != b'\n':
                        # Only strip a newline that is actually present so
                        # a final unterminated line isn't mangled.
                        if next_line.endswith(b'\n'):
                            next_line = next_line[:-1]
                        self.push_line(next_line)
                return read_bytes
        else:
            # Covers both a non-matching line and EOF (line is None)
            self.abort(errors.MissingSection, required_for, section)

    def _who_when(self, s, cmd, section, accept_just_who=False):
        """Parse who and when information from a string.

        :return: a tuple of (name,email,timestamp,timezone). name may be
          the empty string if only an email address was given.
        """
        match = _WHO_AND_WHEN_RE.search(s)
        if match:
            datestr = match.group(3).lstrip()
            if self.date_parser is None:
                # auto-detect the date format from the first date seen
                if len(datestr.split(b' ')) == 2:
                    date_format = 'raw'
                elif datestr == b'now':
                    date_format = 'now'
                else:
                    date_format = 'rfc2822'
                self.date_parser = dates.DATE_PARSERS_BY_NAME[date_format]
            try:
                when = self.date_parser(datestr, self.lineno)
            except ValueError:
                print("failed to parse datestr '%s'" % (datestr,))
                raise
            name = match.group(1).rstrip()
            email = match.group(2)
        else:
            match = _WHO_RE.search(s)
            if accept_just_who and match:
                # HACK around missing time
                # TODO: output a warning here
                when = dates.DATE_PARSERS_BY_NAME['now']('now')
                name = match.group(1)
                email = match.group(2)
            elif self.strict:
                self.abort(errors.BadFormat, cmd, section, s)
            else:
                name = s
                email = None
                when = dates.DATE_PARSERS_BY_NAME['now']('now')
        # Drop the single space left between the name and '<'
        if name.endswith(b' '):
            name = name[:-1]
        # While it shouldn't happen, some datasets have email addresses
        # which contain unicode characters. See bug 338186. We sanitize
        # the data at this level just in case.
        if self.user_mapper:
            name, email = self.user_mapper.map_name_and_email(name, email)
        return Authorship(name, email, when[0], when[1])

    def _name_value(self, s):
        """Parse a (name,value) tuple from 'name value-length value'."""
        parts = s.split(b' ', 2)
        name = parts[0]
        if len(parts) == 1:
            value = None
        else:
            size = int(parts[1])
            value = parts[2]
            # The value may span lines: the newline(s) count towards size,
            # so read the remainder (the final byte read is presumed to be
            # the terminating newline and is dropped).
            still_to_read = size - len(value)
            if still_to_read > 0:
                read_bytes = self.read_bytes(still_to_read)
                value += b'\n' + read_bytes[:still_to_read - 1]
        return (name, value)

    def _path(self, s):
        """Parse a path, unquoting it if it is C-style quoted."""
        if s.startswith(b'"'):
            if not s.endswith(b'"'):
                self.abort(errors.BadFormat, '?', '?', s)
            else:
                return _unquote_c_string(s[1:-1])
        return s

    def _path_pair(self, s):
        """Parse two paths separated by a space."""
        # TODO: handle a space in the first path
        if s.startswith(b'"'):
            parts = s[1:].split(b'" ', 1)
        else:
            parts = s.split(b' ', 1)
        if len(parts) != 2:
            self.abort(errors.BadFormat, '?', '?', s)
        elif parts[1].startswith(b'"') and parts[1].endswith(b'"'):
            parts[1] = parts[1][1:-1]
        elif parts[1].startswith(b'"') or parts[1].endswith(b'"'):
            # An unbalanced quote on the second path is malformed
            self.abort(errors.BadFormat, '?', '?', s)
        return [_unquote_c_string(part) for part in parts]

    def _mode(self, s):
        """Check file mode format and parse into an int.

        :return: mode as integer
        """
        # Note: Output from git-fast-export slightly different to spec
        if s in [b'644', b'100644', b'0100644']:
            return 0o100644
        elif s in [b'755', b'100755', b'0100755']:
            return 0o100755
        elif s in [b'040000', b'0040000']:
            return 0o40000
        elif s in [b'120000', b'0120000']:
            return 0o120000
        elif s in [b'160000', b'0160000']:
            return 0o160000
        else:
            self.abort(errors.BadFormat, 'filemodify', 'mode', s)
|
|
620 |
|
|
621 |
|
|
# Matches one C-style escape sequence inside a bytes path string; each
# match is decoded individually by _unquote_c_string below.
ESCAPE_SEQUENCE_BYTES_RE = re.compile(br'''
    ( \\U........      # 8-digit hex escapes
    | \\u....          # 4-digit hex escapes
    | \\x..            # 2-digit hex escapes
    | \\[0-7]{1,3}     # Octal escapes
    | \\N\{[^}]+\}     # Unicode characters by name
    | \\[\\'"abfnrtv]  # Single-character escapes
    )''', re.VERBOSE
)

# Text (str) variant of the pattern above, used on Python 2 and for
# str input on Python 3.
ESCAPE_SEQUENCE_RE = re.compile(r'''
    ( \\U........
    | \\u....
    | \\x..
    | \\[0-7]{1,3}
    | \\N\{[^}]+\}
    | \\[\\'"abfnrtv]
    )''', re.UNICODE | re.VERBOSE
)
|
|
641 |
|
|
def _unquote_c_string(s):
    """Replace C-style escape sequences (\\n, \\", etc.) with real chars."""

    # A blanket s.encode('utf-8').decode('unicode_escape') can corrupt
    # non-ASCII text (on both py2 and py3), so only the escape sequences
    # themselves are located and decoded, one match at a time.
    def _decode_one(match):
        decoded = codecs.decode(match.group(0), 'unicode-escape')
        return utf8_bytes_string(decoded)

    if sys.version_info[0] >= 3 and isinstance(s, bytes):
        pattern = ESCAPE_SEQUENCE_BYTES_RE
    else:
        pattern = ESCAPE_SEQUENCE_RE
    return pattern.sub(_decode_one, s)
|
|
657 |
|
|
658 |
|
|
# Immutable record of who made a change and when: the parsed form of an
# author/committer/tagger line.
Authorship = collections.namedtuple(
    'Authorship', ['name', 'email', 'timestamp', 'timezone'])