Mercurial > hg > hg-fastimport
comparison hgext3rd/fastimport/vendor/python_fastimport/processors/info_processor.py @ 86:28704a2a7461 vendor/python-fastimport
Import python-fastimport-0.9.8
| author | Roy Marples <roy@marples.name> |
|---|---|
| date | Tue, 19 Jan 2021 22:56:34 +0000 |
| parents | |
| children | 2fc99e3479d9 |
comparison
equal
deleted
inserted
replaced
| 85:1f5544a8870b | 86:28704a2a7461 |
|---|---|
| 1 # Copyright (C) 2008 Canonical Ltd | |
| 2 # | |
| 3 # This program is free software; you can redistribute it and/or modify | |
| 4 # it under the terms of the GNU General Public License as published by | |
| 5 # the Free Software Foundation; either version 2 of the License, or | |
| 6 # (at your option) any later version. | |
| 7 # | |
| 8 # This program is distributed in the hope that it will be useful, | |
| 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 11 # GNU General Public License for more details. | |
| 12 # | |
| 13 # You should have received a copy of the GNU General Public License | |
| 14 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| 15 | |
| 16 """Import processor that dump stats about the input (and doesn't import).""" | |
| 17 | |
| 18 from __future__ import absolute_import | |
| 19 | |
| 20 from .. import ( | |
| 21 reftracker, | |
| 22 ) | |
| 23 from ..helpers import ( | |
| 24 invert_dict, | |
| 25 invert_dictset, | |
| 26 ) | |
| 27 from fastimport import ( | |
| 28 commands, | |
| 29 processor, | |
| 30 ) | |
| 31 import stat | |
| 32 | |
| 33 | |
class InfoProcessor(processor.ImportProcessor):
    """An import processor that dumps statistics about the input.

    No changes to the current repository are made.

    As well as providing useful information about an import
    stream before importing it, this processor is useful for
    benchmarking the speed at which data can be extracted from
    the source.
    """

    def __init__(self, params=None, verbose=0, outf=None):
        processor.ImportProcessor.__init__(self, params, verbose,
            outf=outf)

    def pre_process(self):
        """Reset all statistics before the stream is processed."""
        # Per-command counters, keyed by the bytes command names
        # produced by the fastimport parser (e.g. b'commit').
        self.cmd_counts = {}
        for cmd in commands.COMMAND_NAMES:
            self.cmd_counts[cmd] = 0
        self.file_cmd_counts = {}
        for fc in commands.FILE_COMMAND_NAMES:
            self.file_cmd_counts[fc] = 0
        # Histogram: number of parents -> number of commits with that many.
        self.parent_counts = {}
        self.max_parent_count = 0
        self.committers = set()
        self.separate_authors_found = False
        self.symlinks_found = False
        self.executables_found = False
        self.sha_blob_references = False
        self.lightweight_tags = 0
        # Blob usage tracking: a blob id lives in one of these sets,
        # or graduates to blob_ref_counts once referenced more than once.
        self.blobs = {}
        for usage in ['new', 'used', 'unknown', 'unmarked']:
            self.blobs[usage] = set()
        self.blob_ref_counts = {}
        # Head tracking
        self.reftracker = reftracker.RefTracker()
        # Stuff to cache: a map from mark to # of times that mark is merged
        self.merges = {}
        # Stuff to cache: these are maps from mark to sets
        self.rename_old_paths = {}
        self.copy_source_paths = {}

    def post_process(self):
        """Dump the accumulated statistics to the output stream."""
        cmd_names = commands.COMMAND_NAMES
        fc_names = commands.FILE_COMMAND_NAMES
        self._dump_stats_group("Command counts",
            [(c.decode('utf-8'), self.cmd_counts[c]) for c in cmd_names], str)
        self._dump_stats_group("File command counts",
            [(c.decode('utf-8'), self.file_cmd_counts[c]) for c in fc_names], str)

        # Commit stats
        if self.cmd_counts[b'commit']:
            p_items = []
            for i in range(self.max_parent_count + 1):
                if i in self.parent_counts:
                    count = self.parent_counts[i]
                    p_items.append(("parents-%d" % i, count))
            merges_count = len(self.merges)
            p_items.append(('total revisions merged', merges_count))
            flags = {
                'separate authors found': self.separate_authors_found,
                'executables': self.executables_found,
                'symlinks': self.symlinks_found,
                'blobs referenced by SHA': self.sha_blob_references,
                }
            self._dump_stats_group("Parent counts", p_items, str)
            self._dump_stats_group("Commit analysis", sorted(flags.items()),
                _found)
            heads = invert_dictset(self.reftracker.heads)
            self._dump_stats_group(
                "Head analysis",
                [(k.decode('utf-8'),
                    ', '.join([m.decode('utf-8') for m in v]))
                 for (k, v) in heads.items()], None,
                _iterable_as_config_list)
            # note("\t%d\t%s" % (len(self.committers), 'unique committers'))
            self._dump_stats_group("Merges", self.merges.items(), None)
            # We only show the rename old path and copy source paths when -vv
            # (verbose=2) is specified. The output here for mysql's data can't
            # be parsed currently so this bit of code needs more work anyhow ..
            if self.verbose >= 2:
                self._dump_stats_group("Rename old paths",
                    self.rename_old_paths.items(), len,
                    _iterable_as_config_list)
                self._dump_stats_group("Copy source paths",
                    self.copy_source_paths.items(), len,
                    _iterable_as_config_list)

        # Blob stats
        if self.cmd_counts[b'blob']:
            # In verbose mode, don't list every blob used
            if self.verbose:
                del self.blobs['used']
            self._dump_stats_group("Blob usage tracking",
                self.blobs.items(), len, _iterable_as_config_list)
        if self.blob_ref_counts:
            blobs_by_count = invert_dict(self.blob_ref_counts)
            blob_items = sorted(blobs_by_count.items())
            self._dump_stats_group("Blob reference counts",
                blob_items, len, _iterable_as_config_list)

        # Other stats
        if self.cmd_counts[b'reset']:
            reset_stats = {
                'lightweight tags': self.lightweight_tags,
                }
            self._dump_stats_group("Reset analysis", reset_stats.items())

    def _dump_stats_group(self, title, items, normal_formatter=None,
            verbose_formatter=None):
        """Dump a statistics group.

        In verbose mode, do so as a config file so
        that other processors can load the information if they want to.
        :param normal_formatter: the callable to apply to the value
          before displaying it in normal mode
        :param verbose_formatter: the callable to apply to the value
          before displaying it in verbose mode
        """
        if self.verbose:
            self.outf.write("[%s]\n" % (title,))
            for name, value in items:
                if verbose_formatter is not None:
                    value = verbose_formatter(value)
                # Config-file keys must not contain spaces.
                if isinstance(name, str):
                    name = name.replace(' ', '-')
                self.outf.write("%s = %s\n" % (name, value))
            self.outf.write("\n")
        else:
            self.outf.write("%s:\n" % (title,))
            for name, value in items:
                if normal_formatter is not None:
                    value = normal_formatter(value)
                self.outf.write("\t%s\t%s\n" % (value, name))

    def progress_handler(self, cmd):
        """Process a ProgressCommand."""
        self.cmd_counts[cmd.name] += 1

    def blob_handler(self, cmd):
        """Process a BlobCommand."""
        self.cmd_counts[cmd.name] += 1
        if cmd.mark is None:
            self.blobs['unmarked'].add(cmd.id)
        else:
            self.blobs['new'].add(cmd.id)
            # Marks can be re-used so remove it from used if already there.
            # Note: we definitely do NOT want to remove it from multi if
            # it's already in that set.
            try:
                self.blobs['used'].remove(cmd.id)
            except KeyError:
                pass

    def checkpoint_handler(self, cmd):
        """Process a CheckpointCommand."""
        self.cmd_counts[cmd.name] += 1

    def commit_handler(self, cmd):
        """Process a CommitCommand."""
        self.cmd_counts[cmd.name] += 1
        self.committers.add(cmd.committer)
        if cmd.author is not None:
            self.separate_authors_found = True
        for fc in cmd.iter_files():
            self.file_cmd_counts[fc.name] += 1
            if isinstance(fc, commands.FileModifyCommand):
                if fc.mode & 0o111:
                    self.executables_found = True
                if stat.S_ISLNK(fc.mode):
                    self.symlinks_found = True
                if fc.dataref is not None:
                    # dataref is bytes; slice rather than index so the
                    # comparison works on Python 3 (bytes[0] is an int,
                    # so b':123'[0] == ':' was always False).
                    if fc.dataref[0:1] == b':':
                        self._track_blob(fc.dataref)
                    else:
                        self.sha_blob_references = True
            elif isinstance(fc, commands.FileRenameCommand):
                self.rename_old_paths.setdefault(cmd.id, set()).add(fc.old_path)
            elif isinstance(fc, commands.FileCopyCommand):
                self.copy_source_paths.setdefault(cmd.id, set()).add(fc.src_path)

        # Track the heads
        parents = self.reftracker.track_heads(cmd)

        # Track the parent counts
        parent_count = len(parents)
        try:
            self.parent_counts[parent_count] += 1
        except KeyError:
            self.parent_counts[parent_count] = 1
        if parent_count > self.max_parent_count:
            self.max_parent_count = parent_count

        # Remember the merges
        if cmd.merges:
            #self.merges.setdefault(cmd.ref, set()).update(cmd.merges)
            for merge in cmd.merges:
                if merge in self.merges:
                    self.merges[merge] += 1
                else:
                    self.merges[merge] = 1

    def reset_handler(self, cmd):
        """Process a ResetCommand."""
        self.cmd_counts[cmd.name] += 1
        # cmd.ref is bytes, so the prefix must be bytes too: a str
        # prefix makes bytes.startswith() raise TypeError on Python 3.
        if cmd.ref.startswith(b'refs/tags/'):
            self.lightweight_tags += 1
        else:
            if cmd.from_ is not None:
                self.reftracker.track_heads_for_ref(
                    cmd.ref, cmd.from_)

    def tag_handler(self, cmd):
        """Process a TagCommand."""
        self.cmd_counts[cmd.name] += 1

    def feature_handler(self, cmd):
        """Process a FeatureCommand."""
        self.cmd_counts[cmd.name] += 1
        feature = cmd.feature_name
        if feature not in commands.FEATURE_NAMES:
            self.warning("feature %s is not supported - parsing may fail"
                % (feature,))

    def _track_blob(self, mark):
        """Record one more reference to the blob identified by mark."""
        if mark in self.blob_ref_counts:
            # Already counted explicitly - just bump the count.
            self.blob_ref_counts[mark] += 1
        elif mark in self.blobs['used']:
            # Second reference: start explicit counting.
            self.blob_ref_counts[mark] = 2
            self.blobs['used'].remove(mark)
        elif mark in self.blobs['new']:
            # First reference to a known blob.
            self.blobs['used'].add(mark)
            self.blobs['new'].remove(mark)
        else:
            self.blobs['unknown'].add(mark)
| 272 | |
| 273 def _found(b): | |
| 274 """Format a found boolean as a string.""" | |
| 275 return ['no', 'found'][b] | |
| 276 | |
| 277 def _iterable_as_config_list(s): | |
| 278 """Format an iterable as a sequence of comma-separated strings. | |
| 279 | |
| 280 To match what ConfigObj expects, a single item list has a trailing comma. | |
| 281 """ | |
| 282 items = sorted(s) | |
| 283 if len(items) == 1: | |
| 284 return "%s," % (items[0],) | |
| 285 else: | |
| 286 return ", ".join(items) |
