# Copyright (C) 2008 Canonical Ltd
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

|
|
16 """Import processor that dump stats about the input (and doesn't import).""" |
|
|
17 |
|
|
18 from __future__ import absolute_import |
|
|
19 |
|
|
20 from .. import ( |
|
|
21 reftracker, |
|
|
22 ) |
|
|
23 from ..helpers import ( |
|
|
24 invert_dict, |
|
|
25 invert_dictset, |
|
|
26 ) |
|
|
27 from fastimport import ( |
|
|
28 commands, |
|
|
29 processor, |
|
|
30 ) |
|
|
31 import stat |
|
|
32 |
|
|
33 |
|
|
class InfoProcessor(processor.ImportProcessor):
    """An import processor that dumps statistics about the input.

    No changes to the current repository are made.

    As well as providing useful information about an import
    stream before importing it, this processor is useful for
    benchmarking the speed at which data can be extracted from
    the source.
    """

    def __init__(self, params=None, verbose=0, outf=None):
        processor.ImportProcessor.__init__(self, params, verbose,
            outf=outf)

    def pre_process(self):
        """Initialise every statistic gathered while processing the stream."""
        # Per-command counters, keyed by the (bytes) command names that the
        # fastimport library declares.
        self.cmd_counts = {}
        for cmd in commands.COMMAND_NAMES:
            self.cmd_counts[cmd] = 0
        self.file_cmd_counts = {}
        for fc in commands.FILE_COMMAND_NAMES:
            self.file_cmd_counts[fc] = 0
        # Commit analysis
        self.parent_counts = {}
        self.max_parent_count = 0
        self.committers = set()
        self.separate_authors_found = False
        self.symlinks_found = False
        self.executables_found = False
        self.sha_blob_references = False
        self.lightweight_tags = 0
        # Blob usage tracking
        self.blobs = {}
        for usage in ['new', 'used', 'unknown', 'unmarked']:
            self.blobs[usage] = set()
        self.blob_ref_counts = {}
        # Head tracking
        self.reftracker = reftracker.RefTracker()
        # Stuff to cache: a map from mark to # of times that mark is merged
        self.merges = {}
        # Stuff to cache: these are maps from mark to sets
        self.rename_old_paths = {}
        self.copy_source_paths = {}

    def post_process(self):
        """Dump all gathered statistics to self.outf."""
        cmd_names = commands.COMMAND_NAMES
        fc_names = commands.FILE_COMMAND_NAMES
        self._dump_stats_group(
            "Command counts",
            [(c.decode('utf-8'), self.cmd_counts[c]) for c in cmd_names],
            str)
        self._dump_stats_group(
            "File command counts",
            [(c.decode('utf-8'), self.file_cmd_counts[c]) for c in fc_names],
            str)

        # Commit stats
        if self.cmd_counts[b'commit']:
            p_items = []
            for i in range(self.max_parent_count + 1):
                if i in self.parent_counts:
                    count = self.parent_counts[i]
                    p_items.append(("parents-%d" % i, count))
            merges_count = len(self.merges)
            p_items.append(('total revisions merged', merges_count))
            flags = {
                'separate authors found': self.separate_authors_found,
                'executables': self.executables_found,
                'symlinks': self.symlinks_found,
                'blobs referenced by SHA': self.sha_blob_references,
                }
            self._dump_stats_group("Parent counts", p_items, str)
            self._dump_stats_group(
                "Commit analysis", sorted(flags.items()), _found)
            heads = invert_dictset(self.reftracker.heads)
            self._dump_stats_group(
                "Head analysis",
                [(k.decode('utf-8'),
                  ', '.join([m.decode('utf-8') for m in v]))
                 for (k, v) in heads.items()], None,
                _iterable_as_config_list)
            self._dump_stats_group("Merges", self.merges.items(), None)
            # We only show the rename old path and copy source paths when -vv
            # (verbose=2) is specified. The output here for mysql's data can't
            # be parsed currently so this bit of code needs more work anyhow ..
            if self.verbose >= 2:
                self._dump_stats_group(
                    "Rename old paths",
                    self.rename_old_paths.items(), len,
                    _iterable_as_config_list)
                self._dump_stats_group(
                    "Copy source paths",
                    self.copy_source_paths.items(), len,
                    _iterable_as_config_list)

        # Blob stats
        if self.cmd_counts[b'blob']:
            # In verbose mode, don't list every blob used
            if self.verbose:
                del self.blobs['used']
            self._dump_stats_group(
                "Blob usage tracking",
                self.blobs.items(), len, _iterable_as_config_list)
        if self.blob_ref_counts:
            blobs_by_count = invert_dict(self.blob_ref_counts)
            blob_items = sorted(blobs_by_count.items())
            self._dump_stats_group(
                "Blob reference counts",
                blob_items, len, _iterable_as_config_list)

        # Other stats
        if self.cmd_counts[b'reset']:
            reset_stats = {
                'lightweight tags': self.lightweight_tags,
                }
            self._dump_stats_group("Reset analysis", reset_stats.items())

    def _dump_stats_group(self, title, items, normal_formatter=None,
            verbose_formatter=None):
        """Dump a statistics group.

        In verbose mode, do so as a config file so
        that other processors can load the information if they want to.
        :param normal_formatter: the callable to apply to the value
          before displaying it in normal mode
        :param verbose_formatter: the callable to apply to the value
          before displaying it in verbose mode
        """
        if self.verbose:
            self.outf.write("[%s]\n" % (title,))
            for name, value in items:
                if verbose_formatter is not None:
                    value = verbose_formatter(value)
                if isinstance(name, str):
                    # Config-file style keys must not contain spaces.
                    name = name.replace(' ', '-')
                self.outf.write("%s = %s\n" % (name, value))
            self.outf.write("\n")
        else:
            self.outf.write("%s:\n" % (title,))
            for name, value in items:
                if normal_formatter is not None:
                    value = normal_formatter(value)
                self.outf.write("\t%s\t%s\n" % (value, name))

    def progress_handler(self, cmd):
        """Process a ProgressCommand."""
        self.cmd_counts[cmd.name] += 1

    def blob_handler(self, cmd):
        """Process a BlobCommand."""
        self.cmd_counts[cmd.name] += 1
        if cmd.mark is None:
            self.blobs['unmarked'].add(cmd.id)
        else:
            self.blobs['new'].add(cmd.id)
            # Marks can be re-used so remove it from used if already there.
            # Note: we definitely do NOT want to remove it from multi if
            # it's already in that set.
            self.blobs['used'].discard(cmd.id)

    def checkpoint_handler(self, cmd):
        """Process a CheckpointCommand."""
        self.cmd_counts[cmd.name] += 1

    def commit_handler(self, cmd):
        """Process a CommitCommand."""
        self.cmd_counts[cmd.name] += 1
        self.committers.add(cmd.committer)
        if cmd.author is not None:
            self.separate_authors_found = True
        for fc in cmd.iter_files():
            self.file_cmd_counts[fc.name] += 1
            if isinstance(fc, commands.FileModifyCommand):
                if fc.mode & 0o111:
                    self.executables_found = True
                if stat.S_ISLNK(fc.mode):
                    self.symlinks_found = True
                if fc.dataref is not None:
                    # dataref is bytes: b':mark' references a blob by mark,
                    # anything else is a raw SHA reference.  (Indexing bytes
                    # yields an int, so fc.dataref[0] == ':' was never true
                    # on Python 3 - use a bytes prefix test instead.)
                    if fc.dataref.startswith(b':'):
                        self._track_blob(fc.dataref)
                    else:
                        self.sha_blob_references = True
            elif isinstance(fc, commands.FileRenameCommand):
                self.rename_old_paths.setdefault(cmd.id, set()).add(
                    fc.old_path)
            elif isinstance(fc, commands.FileCopyCommand):
                self.copy_source_paths.setdefault(cmd.id, set()).add(
                    fc.src_path)

        # Track the heads
        parents = self.reftracker.track_heads(cmd)

        # Track the parent counts
        parent_count = len(parents)
        self.parent_counts[parent_count] = (
            self.parent_counts.get(parent_count, 0) + 1)
        if parent_count > self.max_parent_count:
            self.max_parent_count = parent_count

        # Remember how many times each revision is merged
        if cmd.merges:
            for merge in cmd.merges:
                self.merges[merge] = self.merges.get(merge, 0) + 1

    def reset_handler(self, cmd):
        """Process a ResetCommand."""
        self.cmd_counts[cmd.name] += 1
        # cmd.ref is bytes, so the prefix must be bytes too
        # (bytes.startswith(str) raises TypeError).
        if cmd.ref.startswith(b'refs/tags/'):
            self.lightweight_tags += 1
        else:
            if cmd.from_ is not None:
                self.reftracker.track_heads_for_ref(
                    cmd.ref, cmd.from_)

    def tag_handler(self, cmd):
        """Process a TagCommand."""
        self.cmd_counts[cmd.name] += 1

    def feature_handler(self, cmd):
        """Process a FeatureCommand."""
        self.cmd_counts[cmd.name] += 1
        feature = cmd.feature_name
        if feature not in commands.FEATURE_NAMES:
            self.warning("feature %s is not supported - parsing may fail"
                % (feature,))

    def _track_blob(self, mark):
        """Record one reference to the blob identified by mark."""
        if mark in self.blob_ref_counts:
            self.blob_ref_counts[mark] += 1
        elif mark in self.blobs['used']:
            # Second reference seen: start explicit counting from 2.
            self.blob_ref_counts[mark] = 2
            self.blobs['used'].remove(mark)
        elif mark in self.blobs['new']:
            # First use of a blob defined earlier in the stream.
            self.blobs['used'].add(mark)
            self.blobs['new'].remove(mark)
        else:
            self.blobs['unknown'].add(mark)
|
|
273 def _found(b): |
|
|
274 """Format a found boolean as a string.""" |
|
|
275 return ['no', 'found'][b] |
|
|
276 |
|
|
277 def _iterable_as_config_list(s): |
|
|
278 """Format an iterable as a sequence of comma-separated strings. |
|
|
279 |
|
|
280 To match what ConfigObj expects, a single item list has a trailing comma. |
|
|
281 """ |
|
|
282 items = sorted(s) |
|
|
283 if len(items) == 1: |
|
|
284 return "%s," % (items[0],) |
|
|
285 else: |
|
|
286 return ", ".join(items) |