about summary refs log tree commit diff
diff options
context:
space:
mode:
author Brian Harring <ferringb@google.com> 2012-10-16 00:21:07 -0700
committer Brian Harring <ferringb@google.com> 2012-10-16 13:28:49 -0700
commit d740be6709ab7ac6a1b271430d650e4381f3f761 (patch)
tree 61f32e9f96a942891344f5f021ae4052a6ae59c6 /rewrite-commit-dump.py
parent ongoing work (diff)
download git-conversion-tools-d740be6709ab7ac6a1b271430d650e4381f3f761.tar.gz
git-conversion-tools-d740be6709ab7ac6a1b271430d650e4381f3f761.tar.bz2
git-conversion-tools-d740be6709ab7ac6a1b271430d650e4381f3f761.zip
refactoring; bypass the commit creation in each repo and linearization by git; handle it ourselves
Diffstat (limited to 'rewrite-commit-dump.py')
-rwxr-xr-x rewrite-commit-dump.py 54
1 files changed, 41 insertions, 13 deletions
diff --git a/rewrite-commit-dump.py b/rewrite-commit-dump.py
index 7678406..f657a8e 100755
--- a/rewrite-commit-dump.py
+++ b/rewrite-commit-dump.py
@@ -1,5 +1,7 @@
#!/usr/bin/python
import functools
+import operator
+import os
import re
import sys
from collections import namedtuple
@@ -12,10 +14,10 @@ mangler.append(functools.partial(
re.compile(r"^\(portage version: (.*)\)$", re.M|re.I).sub,
r"Package-Manager: portage-\1"))
-fields = ('mark', 'author', 'committer', 'msg', 'files')
+fields = ('author', 'committer', 'msg', 'files', 'timestamp')
record = namedtuple('record', fields)
-def deserialize_records(source):
+def deserialize_records(source, blob_idx):
line = source.readline()
while line:
while line.split()[0] in ('reset', 'progress'):
@@ -28,9 +30,9 @@ def deserialize_records(source):
line = source.readline()
chunks = line.split(None, 1)
assert len(chunks) == 2, line
- if chunks[0] == 'from':
+ if chunks[0] in ('from', 'mark'):
continue
- assert chunks[0] in ('mark', 'author', 'committer', 'data')
+ assert chunks[0] in ('author', 'committer', 'data')
if chunks[0] != 'data':
d[chunks[0]] = chunks[1].strip()
continue
@@ -63,28 +65,39 @@ def deserialize_records(source):
files[mode[1]] = (mode[0], line)
elif mode[0] == 'M':
# M 100644 e8b9ed651c6209820779382edee2537209aba4ae dev-cpp/gtkmm/ChangeLog
- chunks = mode[1].split(None, 3)
- assert len(chunks) == 3, line
- files[chunks[2]] = (mode[0], line)
+ # if it's not a sha1, but startswith ':'... then it's an index.
+ chunks = line.split(None, 4)
+ assert len(chunks) == 4, line
+ fname = chunks[3]
+ if chunks[2][0] == ':':
+ line = ' '.join(chunks[:2] + [blob_idx[int(chunks[2][1:])], fname])
+ files[fname] = (mode[0], line)
else:
raise AssertionError("got unknown file op: mode=%r, line:\n%r" % (mode[0], line))
line = source.readline()
d['files'] = files
# Basic sanity check for the code above...
assert set(fields).issuperset(d), d
+ d.setdefault('author', d.get('committer'))
+ assert d['author'] is not None
+ # Skank the timestamp out...
+ chunks = d['author'].rsplit(None, 1)
+ assert len(chunks) == 2 and chunks[1] == '+0000', d['author']
+ d['timestamp'] = long(chunks[0].rsplit(None, 1)[1])
yield record(*[d.get(x) for x in fields])
# Bleh... of course namedtuple doesn't make this easy.
line = source.readline()
-def serialize_records(records, handle, target='refs/heads/master', progress=1000):
+def serialize_records(records, handle, target='refs/heads/master', progress=5000):
write = handle.write
write('reset %s\n' % target)
total = len(records)
for idx, record in enumerate(records, 1):
if idx % progress == 0:
write('progress %02.1f%%: %i of %i commits\n'
- % ((100 * float(idx))//total, idx, total))
+ % ((100 * float(idx))/total, idx, total))
write('commit %s\n' % target)
+ write('mark :%i\n' % idx)
# fields = ('mark', 'author', 'committer', 'msg', 'files')
for name, value in zip(fields, record):
if name == 'files':
@@ -94,17 +107,32 @@ def serialize_records(records, handle, target='refs/heads/master', progress=1000
write("%s %s\n" % (name, value))
elif name == 'msg':
write("data %i\n%s" % (len(value), value))
+ elif name == 'timestamp':
+ continue
else:
raise AssertionError("serialize is out of sync; don't know field %s" % name)
write("\n")
+def deserialize_blob_map(source):
+ source = (x.strip().split() for x in source)
+ return dict((int(x[0].lstrip(':')), x[1]) for x in source)
+
def main(argv):
- source = open(argv[0], 'r') if argv else sys.stdin
- records = list(deserialize_records(source))
+ records = []
+ source = argv if argv else sys.stdin
+ directories = [x.strip() for x in source]
+ for directory in directories:
+ tmp = os.path.join(directory, 'cvs2svn-tmp')
+ commits = os.path.join(tmp, 'git-dump.dat')
+ if not os.path.exists(commits):
+ sys.stderr.write("skipping %s; no commit data\n" % directory)
+ continue
+ blob_index = deserialize_blob_map(open(os.path.join(tmp, 'git-blob.idx')))
+ records.extend(deserialize_records(open(commits, 'r'), blob_index))
+ records.sort(key=operator.attrgetter('timestamp'))
+ #records = list(deserialize_records(source))
serialize_records(records, sys.stdout)
return 0
if __name__ == '__main__':
- if len(sys.argv) not in (1, 2):
- raise SystemExit("args must be either none, or path to fast-export stream to read", code=1)
sys.exit(main(sys.argv[1:]))