comparison hgext3rd/fastimport/vendor/python_fastimport/processors/info_processor.py @ 86:28704a2a7461 vendor/python-fastimport

Import python-fastimport-0.9.8
author Roy Marples <roy@marples.name>
date Tue, 19 Jan 2021 22:56:34 +0000
parents
children 2fc99e3479d9
comparison
equal deleted inserted replaced
85:1f5544a8870b 86:28704a2a7461
1 # Copyright (C) 2008 Canonical Ltd
2 #
3 # This program is free software; you can redistribute it and/or modify
4 # it under the terms of the GNU General Public License as published by
5 # the Free Software Foundation; either version 2 of the License, or
6 # (at your option) any later version.
7 #
8 # This program is distributed in the hope that it will be useful,
9 # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 # GNU General Public License for more details.
12 #
13 # You should have received a copy of the GNU General Public License
14 # along with this program. If not, see <http://www.gnu.org/licenses/>.
15
16 """Import processor that dump stats about the input (and doesn't import)."""
17
18 from __future__ import absolute_import
19
20 from .. import (
21 reftracker,
22 )
23 from ..helpers import (
24 invert_dict,
25 invert_dictset,
26 )
27 from fastimport import (
28 commands,
29 processor,
30 )
31 import stat
32
33
class InfoProcessor(processor.ImportProcessor):
    """An import processor that dumps statistics about the input.

    No changes to the current repository are made.

    As well as providing useful information about an import
    stream before importing it, this processor is useful for
    benchmarking the speed at which data can be extracted from
    the source.
    """

    def __init__(self, params=None, verbose=0, outf=None):
        processor.ImportProcessor.__init__(self, params, verbose,
            outf=outf)

    def pre_process(self):
        """Initialise all statistics before the stream is processed."""
        # Occurrence counts keyed by the (bytes) command name.
        self.cmd_counts = {}
        for cmd in commands.COMMAND_NAMES:
            self.cmd_counts[cmd] = 0
        self.file_cmd_counts = {}
        for fc in commands.FILE_COMMAND_NAMES:
            self.file_cmd_counts[fc] = 0
        # Map from number-of-parents to how many commits had that many.
        self.parent_counts = {}
        self.max_parent_count = 0
        self.committers = set()
        self.separate_authors_found = False
        self.symlinks_found = False
        self.executables_found = False
        self.sha_blob_references = False
        self.lightweight_tags = 0
        # Blob usage tracking: each usage class maps to a set of blob ids.
        self.blobs = {}
        for usage in ['new', 'used', 'unknown', 'unmarked']:
            self.blobs[usage] = set()
        # Map from blob mark to reference count (blobs referenced 2+ times).
        self.blob_ref_counts = {}
        # Head tracking
        self.reftracker = reftracker.RefTracker()
        # Stuff to cache: a map from mark to # of times that mark is merged
        self.merges = {}
        # Stuff to cache: these are maps from mark to sets
        self.rename_old_paths = {}
        self.copy_source_paths = {}

    def post_process(self):
        """Dump the accumulated statistics to self.outf."""
        cmd_names = commands.COMMAND_NAMES
        fc_names = commands.FILE_COMMAND_NAMES
        self._dump_stats_group("Command counts",
            [(c.decode('utf-8'), self.cmd_counts[c]) for c in cmd_names], str)
        self._dump_stats_group("File command counts",
            [(c.decode('utf-8'), self.file_cmd_counts[c]) for c in fc_names],
            str)

        # Commit stats
        if self.cmd_counts[b'commit']:
            p_items = []
            for i in range(self.max_parent_count + 1):
                if i in self.parent_counts:
                    count = self.parent_counts[i]
                    p_items.append(("parents-%d" % i, count))
            merges_count = len(self.merges)
            p_items.append(('total revisions merged', merges_count))
            flags = {
                'separate authors found': self.separate_authors_found,
                'executables': self.executables_found,
                'symlinks': self.symlinks_found,
                'blobs referenced by SHA': self.sha_blob_references,
                }
            self._dump_stats_group("Parent counts", p_items, str)
            self._dump_stats_group("Commit analysis", sorted(flags.items()),
                _found)
            heads = invert_dictset(self.reftracker.heads)
            self._dump_stats_group(
                "Head analysis",
                [(k.decode('utf-8'),
                    ', '.join([m.decode('utf-8') for m in v]))
                    for (k, v) in heads.items()], None,
                _iterable_as_config_list)
            # note("\t%d\t%s" % (len(self.committers), 'unique committers'))
            self._dump_stats_group("Merges", self.merges.items(), None)
            # We only show the rename old path and copy source paths when -vv
            # (verbose=2) is specified. The output here for mysql's data can't
            # be parsed currently so this bit of code needs more work anyhow ..
            if self.verbose >= 2:
                self._dump_stats_group("Rename old paths",
                    self.rename_old_paths.items(), len,
                    _iterable_as_config_list)
                self._dump_stats_group("Copy source paths",
                    self.copy_source_paths.items(), len,
                    _iterable_as_config_list)

        # Blob stats
        if self.cmd_counts[b'blob']:
            # In verbose mode, don't list every blob used
            if self.verbose:
                del self.blobs['used']
            self._dump_stats_group("Blob usage tracking",
                self.blobs.items(), len, _iterable_as_config_list)
        if self.blob_ref_counts:
            blobs_by_count = invert_dict(self.blob_ref_counts)
            blob_items = sorted(blobs_by_count.items())
            self._dump_stats_group("Blob reference counts",
                blob_items, len, _iterable_as_config_list)

        # Other stats
        if self.cmd_counts[b'reset']:
            reset_stats = {
                'lightweight tags': self.lightweight_tags,
                }
            self._dump_stats_group("Reset analysis", reset_stats.items())

    def _dump_stats_group(self, title, items, normal_formatter=None,
        verbose_formatter=None):
        """Dump a statistics group.

        In verbose mode, do so as a config file so
        that other processors can load the information if they want to.
        :param normal_formatter: the callable to apply to the value
          before displaying it in normal mode
        :param verbose_formatter: the callable to apply to the value
          before displaying it in verbose mode
        """
        if self.verbose:
            self.outf.write("[%s]\n" % (title,))
            for name, value in items:
                if verbose_formatter is not None:
                    value = verbose_formatter(value)
                if isinstance(name, str):
                    # Config-file keys must not contain spaces.
                    name = name.replace(' ', '-')
                self.outf.write("%s = %s\n" % (name, value))
            self.outf.write("\n")
        else:
            self.outf.write("%s:\n" % (title,))
            for name, value in items:
                if normal_formatter is not None:
                    value = normal_formatter(value)
                self.outf.write("\t%s\t%s\n" % (value, name))

    def progress_handler(self, cmd):
        """Process a ProgressCommand."""
        self.cmd_counts[cmd.name] += 1

    def blob_handler(self, cmd):
        """Process a BlobCommand."""
        self.cmd_counts[cmd.name] += 1
        if cmd.mark is None:
            self.blobs['unmarked'].add(cmd.id)
        else:
            self.blobs['new'].add(cmd.id)
            # Marks can be re-used so remove it from used if already there.
            # Note: we definitely do NOT want to remove it from multi if
            # it's already in that set.
            try:
                self.blobs['used'].remove(cmd.id)
            except KeyError:
                pass

    def checkpoint_handler(self, cmd):
        """Process a CheckpointCommand."""
        self.cmd_counts[cmd.name] += 1

    def commit_handler(self, cmd):
        """Process a CommitCommand."""
        self.cmd_counts[cmd.name] += 1
        self.committers.add(cmd.committer)
        if cmd.author is not None:
            self.separate_authors_found = True
        for fc in cmd.iter_files():
            self.file_cmd_counts[fc.name] += 1
            if isinstance(fc, commands.FileModifyCommand):
                if fc.mode & 0o111:
                    self.executables_found = True
                if stat.S_ISLNK(fc.mode):
                    self.symlinks_found = True
                if fc.dataref is not None:
                    # dataref is bytes: indexing yields an int on Python 3,
                    # so compare a one-byte slice to detect a mark (":<n>").
                    if fc.dataref[:1] == b':':
                        self._track_blob(fc.dataref)
                    else:
                        self.sha_blob_references = True
            elif isinstance(fc, commands.FileRenameCommand):
                self.rename_old_paths.setdefault(cmd.id, set()).add(
                    fc.old_path)
            elif isinstance(fc, commands.FileCopyCommand):
                self.copy_source_paths.setdefault(cmd.id, set()).add(
                    fc.src_path)

        # Track the heads
        parents = self.reftracker.track_heads(cmd)

        # Track the parent counts
        parent_count = len(parents)
        try:
            self.parent_counts[parent_count] += 1
        except KeyError:
            self.parent_counts[parent_count] = 1
            if parent_count > self.max_parent_count:
                self.max_parent_count = parent_count

        # Remember the merges
        if cmd.merges:
            for merge in cmd.merges:
                if merge in self.merges:
                    self.merges[merge] += 1
                else:
                    self.merges[merge] = 1

    def reset_handler(self, cmd):
        """Process a ResetCommand."""
        self.cmd_counts[cmd.name] += 1
        # cmd.ref is bytes, so the prefix must be bytes too; a str prefix
        # would never match on Python 3 and tags would go uncounted.
        if cmd.ref.startswith(b'refs/tags/'):
            self.lightweight_tags += 1
        else:
            if cmd.from_ is not None:
                self.reftracker.track_heads_for_ref(
                    cmd.ref, cmd.from_)

    def tag_handler(self, cmd):
        """Process a TagCommand."""
        self.cmd_counts[cmd.name] += 1

    def feature_handler(self, cmd):
        """Process a FeatureCommand."""
        self.cmd_counts[cmd.name] += 1
        feature = cmd.feature_name
        if feature not in commands.FEATURE_NAMES:
            self.warning("feature %s is not supported - parsing may fail"
                % (feature,))

    def _track_blob(self, mark):
        """Record a reference to blob *mark*, updating its usage class."""
        if mark in self.blob_ref_counts:
            # Already multiply-referenced - just bump the count.
            self.blob_ref_counts[mark] += 1
        elif mark in self.blobs['used']:
            # Second reference - start counting explicitly.
            self.blob_ref_counts[mark] = 2
            self.blobs['used'].remove(mark)
        elif mark in self.blobs['new']:
            # First reference to a blob we saw defined.
            self.blobs['used'].add(mark)
            self.blobs['new'].remove(mark)
        else:
            self.blobs['unknown'].add(mark)
272
273 def _found(b):
274 """Format a found boolean as a string."""
275 return ['no', 'found'][b]
276
277 def _iterable_as_config_list(s):
278 """Format an iterable as a sequence of comma-separated strings.
279
280 To match what ConfigObj expects, a single item list has a trailing comma.
281 """
282 items = sorted(s)
283 if len(items) == 1:
284 return "%s," % (items[0],)
285 else:
286 return ", ".join(items)