Mercurial > hg > hg-fastimport
annotate hgext3rd/fastimport/vendor/python_fastimport/processors/info_processor.py @ 88:2fc99e3479d9
python-fastimport: Import our own modules using relative pathing
This allows python-fastimport to be embedded as vendor code within
other modules.
This patch has been accepted upstream.
| author | Roy Marples <roy@marples.name> |
|---|---|
| date | Tue, 19 Jan 2021 23:00:01 +0000 |
| parents | 28704a2a7461 |
| children |
| rev | line source |
|---|---|
| 86 | 1 # Copyright (C) 2008 Canonical Ltd |
| 2 # | |
| 3 # This program is free software; you can redistribute it and/or modify | |
| 4 # it under the terms of the GNU General Public License as published by | |
| 5 # the Free Software Foundation; either version 2 of the License, or | |
| 6 # (at your option) any later version. | |
| 7 # | |
| 8 # This program is distributed in the hope that it will be useful, | |
| 9 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 10 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 11 # GNU General Public License for more details. | |
| 12 # | |
| 13 # You should have received a copy of the GNU General Public License | |
| 14 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| 15 | |
| 16 """Import processor that dump stats about the input (and doesn't import).""" | |
| 17 | |
| 18 from __future__ import absolute_import | |
| 19 | |
| 20 from .. import ( | |
|
88
2fc99e3479d9
python-fastimport: Import our own modules using relative pathing
Roy Marples <roy@marples.name>
parents:
86
diff
changeset
|
21 commands, |
|
2fc99e3479d9
python-fastimport: Import our own modules using relative pathing
Roy Marples <roy@marples.name>
parents:
86
diff
changeset
|
22 processor, |
| 86 | 23 reftracker, |
| 24 ) | |
| 25 from ..helpers import ( | |
| 26 invert_dict, | |
| 27 invert_dictset, | |
| 28 ) | |
| 29 import stat | |
| 30 | |
| 31 | |
class InfoProcessor(processor.ImportProcessor):
    """An import processor that dumps statistics about the input.

    No changes to the current repository are made.

    As well as providing useful information about an import
    stream before importing it, this processor is useful for
    benchmarking the speed at which data can be extracted from
    the source.

    Note: command names, refs, marks and datarefs coming from the
    fastimport stream are bytes; human-readable group titles and
    usage keys are native str.
    """

    def __init__(self, params=None, verbose=0, outf=None):
        processor.ImportProcessor.__init__(self, params, verbose,
            outf=outf)

    def pre_process(self):
        """Initialise all statistics before the stream is parsed."""
        # Per-command and per-file-command counters, keyed by the
        # (bytes) command names declared in the commands module.
        self.cmd_counts = {}
        for cmd in commands.COMMAND_NAMES:
            self.cmd_counts[cmd] = 0
        self.file_cmd_counts = {}
        for fc in commands.FILE_COMMAND_NAMES:
            self.file_cmd_counts[fc] = 0
        self.parent_counts = {}
        self.max_parent_count = 0
        self.committers = set()
        self.separate_authors_found = False
        self.symlinks_found = False
        self.executables_found = False
        self.sha_blob_references = False
        self.lightweight_tags = 0
        # Blob usage tracking: sets of blob ids bucketed by usage class.
        self.blobs = {}
        for usage in ['new', 'used', 'unknown', 'unmarked']:
            self.blobs[usage] = set()
        # Marks referenced more than once, mapped to their reference count.
        self.blob_ref_counts = {}
        # Head tracking
        self.reftracker = reftracker.RefTracker()
        # Stuff to cache: a map from mark to # of times that mark is merged
        self.merges = {}
        # Stuff to cache: these are maps from mark to sets
        self.rename_old_paths = {}
        self.copy_source_paths = {}

    def post_process(self):
        """Dump the accumulated statistics to self.outf."""
        cmd_names = commands.COMMAND_NAMES
        fc_names = commands.FILE_COMMAND_NAMES
        self._dump_stats_group("Command counts",
            [(c.decode('utf-8'), self.cmd_counts[c]) for c in cmd_names], str)
        self._dump_stats_group("File command counts",
            [(c.decode('utf-8'), self.file_cmd_counts[c]) for c in fc_names], str)

        # Commit stats
        if self.cmd_counts[b'commit']:
            p_items = []
            for i in range(self.max_parent_count + 1):
                if i in self.parent_counts:
                    count = self.parent_counts[i]
                    p_items.append(("parents-%d" % i, count))
            merges_count = len(self.merges)
            p_items.append(('total revisions merged', merges_count))
            flags = {
                'separate authors found': self.separate_authors_found,
                'executables': self.executables_found,
                'symlinks': self.symlinks_found,
                'blobs referenced by SHA': self.sha_blob_references,
                }
            self._dump_stats_group("Parent counts", p_items, str)
            self._dump_stats_group("Commit analysis", sorted(flags.items()), _found)
            heads = invert_dictset(self.reftracker.heads)
            self._dump_stats_group(
                "Head analysis",
                [(k.decode('utf-8'),
                  ', '.join([m.decode('utf-8') for m in v]))
                 for (k, v) in heads.items()], None,
                _iterable_as_config_list)
            self._dump_stats_group("Merges", self.merges.items(), None)
            # We only show the rename old path and copy source paths when -vv
            # (verbose=2) is specified. The output here for mysql's data can't
            # be parsed currently so this bit of code needs more work anyhow ..
            if self.verbose >= 2:
                self._dump_stats_group("Rename old paths",
                    self.rename_old_paths.items(), len,
                    _iterable_as_config_list)
                self._dump_stats_group("Copy source paths",
                    self.copy_source_paths.items(), len,
                    _iterable_as_config_list)

        # Blob stats
        if self.cmd_counts[b'blob']:
            # In verbose mode, don't list every blob used
            if self.verbose:
                del self.blobs['used']
            self._dump_stats_group("Blob usage tracking",
                self.blobs.items(), len, _iterable_as_config_list)
        if self.blob_ref_counts:
            blobs_by_count = invert_dict(self.blob_ref_counts)
            blob_items = sorted(blobs_by_count.items())
            self._dump_stats_group("Blob reference counts",
                blob_items, len, _iterable_as_config_list)

        # Other stats
        if self.cmd_counts[b'reset']:
            reset_stats = {
                'lightweight tags': self.lightweight_tags,
                }
            self._dump_stats_group("Reset analysis", reset_stats.items())

    def _dump_stats_group(self, title, items, normal_formatter=None,
        verbose_formatter=None):
        """Dump a statistics group.

        In verbose mode, do so as a config file so
        that other processors can load the information if they want to.
        :param normal_formatter: the callable to apply to the value
          before displaying it in normal mode
        :param verbose_formatter: the callable to apply to the value
          before displaying it in verbose mode
        """
        if self.verbose:
            self.outf.write("[%s]\n" % (title,))
            for name, value in items:
                if verbose_formatter is not None:
                    value = verbose_formatter(value)
                # Config-style keys must not contain spaces.
                if isinstance(name, str):
                    name = name.replace(' ', '-')
                self.outf.write("%s = %s\n" % (name, value))
            self.outf.write("\n")
        else:
            self.outf.write("%s:\n" % (title,))
            for name, value in items:
                if normal_formatter is not None:
                    value = normal_formatter(value)
                self.outf.write("\t%s\t%s\n" % (value, name))

    def progress_handler(self, cmd):
        """Process a ProgressCommand."""
        self.cmd_counts[cmd.name] += 1

    def blob_handler(self, cmd):
        """Process a BlobCommand."""
        self.cmd_counts[cmd.name] += 1
        if cmd.mark is None:
            self.blobs['unmarked'].add(cmd.id)
        else:
            self.blobs['new'].add(cmd.id)
            # Marks can be re-used so remove it from used if already there.
            # Note: we definitely do NOT want to remove it from multi if
            # it's already in that set.
            try:
                self.blobs['used'].remove(cmd.id)
            except KeyError:
                pass

    def checkpoint_handler(self, cmd):
        """Process a CheckpointCommand."""
        self.cmd_counts[cmd.name] += 1

    def commit_handler(self, cmd):
        """Process a CommitCommand: count file commands, track file modes,
        blob references, heads, parents and merges."""
        self.cmd_counts[cmd.name] += 1
        self.committers.add(cmd.committer)
        if cmd.author is not None:
            self.separate_authors_found = True
        for fc in cmd.iter_files():
            self.file_cmd_counts[fc.name] += 1
            if isinstance(fc, commands.FileModifyCommand):
                if fc.mode & 0o111:
                    self.executables_found = True
                if stat.S_ISLNK(fc.mode):
                    self.symlinks_found = True
                if fc.dataref is not None:
                    # datarefs are bytes; a leading ':' marks a mark
                    # reference, anything else is a SHA. Slicing (not
                    # indexing) keeps this correct on Python 3, where
                    # indexing bytes yields an int.
                    if fc.dataref[:1] == b':':
                        self._track_blob(fc.dataref)
                    else:
                        self.sha_blob_references = True
            elif isinstance(fc, commands.FileRenameCommand):
                self.rename_old_paths.setdefault(cmd.id, set()).add(fc.old_path)
            elif isinstance(fc, commands.FileCopyCommand):
                self.copy_source_paths.setdefault(cmd.id, set()).add(fc.src_path)

        # Track the heads
        parents = self.reftracker.track_heads(cmd)

        # Track the parent counts
        parent_count = len(parents)
        try:
            self.parent_counts[parent_count] += 1
        except KeyError:
            self.parent_counts[parent_count] = 1
            if parent_count > self.max_parent_count:
                self.max_parent_count = parent_count

        # Remember the merges
        if cmd.merges:
            for merge in cmd.merges:
                if merge in self.merges:
                    self.merges[merge] += 1
                else:
                    self.merges[merge] = 1

    def reset_handler(self, cmd):
        """Process a ResetCommand."""
        self.cmd_counts[cmd.name] += 1
        # refs are bytes, so compare against a bytes prefix (a str
        # prefix raises TypeError on Python 3).
        if cmd.ref.startswith(b'refs/tags/'):
            self.lightweight_tags += 1
        else:
            if cmd.from_ is not None:
                self.reftracker.track_heads_for_ref(
                    cmd.ref, cmd.from_)

    def tag_handler(self, cmd):
        """Process a TagCommand."""
        self.cmd_counts[cmd.name] += 1

    def feature_handler(self, cmd):
        """Process a FeatureCommand."""
        self.cmd_counts[cmd.name] += 1
        feature = cmd.feature_name
        if feature not in commands.FEATURE_NAMES:
            self.warning("feature %s is not supported - parsing may fail"
                % (feature,))

    def _track_blob(self, mark):
        """Classify a mark reference: count repeat uses, promote new
        blobs to used, and record references to unknown marks."""
        if mark in self.blob_ref_counts:
            self.blob_ref_counts[mark] += 1
        elif mark in self.blobs['used']:
            # Second reference: start explicit counting at 2.
            self.blob_ref_counts[mark] = 2
            self.blobs['used'].remove(mark)
        elif mark in self.blobs['new']:
            self.blobs['used'].add(mark)
            self.blobs['new'].remove(mark)
        else:
            self.blobs['unknown'].add(mark)
| 271 def _found(b): | |
| 272 """Format a found boolean as a string.""" | |
| 273 return ['no', 'found'][b] | |
| 274 | |
| 275 def _iterable_as_config_list(s): | |
| 276 """Format an iterable as a sequence of comma-separated strings. | |
| 277 | |
| 278 To match what ConfigObj expects, a single item list has a trailing comma. | |
| 279 """ | |
| 280 items = sorted(s) | |
| 281 if len(items) == 1: | |
| 282 return "%s," % (items[0],) | |
| 283 else: | |
| 284 return ", ".join(items) |
