Refactoring: bypass the commit creation in each repo and the linearization by git; handle both ourselves

author: Brian Harring <ferringb@google.com> 2012-10-16 00:21:07 -0700
committer: Brian Harring <ferringb@google.com> 2012-10-16 13:28:49 -0700
commit: d740be6709ab7ac6a1b271430d650e4381f3f761 (patch)
tree: 61f32e9f96a942891344f5f021ae4052a6ae59c6 /rewrite-commit-dump.py
parent: ongoing work (diff)
download: git-conversion-tools-d740be6709ab7ac6a1b271430d650e4381f3f761.tar.gz
git-conversion-tools-d740be6709ab7ac6a1b271430d650e4381f3f761.tar.bz2
git-conversion-tools-d740be6709ab7ac6a1b271430d650e4381f3f761.zip
1 files changed, 41 insertions, 13 deletions
diff --git a/rewrite-commit-dump.py b/rewrite-commit-dump.py
index 7678406..f657a8e 100755
--- a/rewrite-commit-dump.py
+++ b/rewrite-commit-dump.py
@@ -1,5 +1,7 @@
 #!/usr/bin/python
 import functools
+import operator
+import os
 import re
 import sys
 from collections import namedtuple
@@ -12,10 +14,10 @@ mangler.append(functools.partial(
   re.compile(r"^\(portage version: (.*)\)$", re.M|re.I).sub,
     r"Package-Manager: portage-\1"))
 
-fields = ('mark', 'author', 'committer', 'msg', 'files')
+fields = ('author', 'committer', 'msg', 'files', 'timestamp')
 record = namedtuple('record', fields)
 
-def deserialize_records(source):
+def deserialize_records(source, blob_idx):
   line = source.readline()
   while line:
     while line.split()[0] in ('reset', 'progress'):
@@ -28,9 +30,9 @@ def deserialize_records(source):
       line = source.readline()
       chunks = line.split(None, 1)
       assert len(chunks) == 2, line
-      if chunks[0] == 'from':
+      if chunks[0] in ('from', 'mark'):
         continue
-      assert chunks[0] in ('mark', 'author', 'committer', 'data')
+      assert chunks[0] in ('author', 'committer', 'data')
       if chunks[0] != 'data':
         d[chunks[0]] = chunks[1].strip()
         continue
@@ -63,28 +65,39 @@ def deserialize_records(source):
         files[mode[1]] = (mode[0], line)
       elif mode[0] == 'M':
         # M 100644 e8b9ed651c6209820779382edee2537209aba4ae dev-cpp/gtkmm/ChangeLog
-        chunks = mode[1].split(None, 3)
-        assert len(chunks) == 3, line
-        files[chunks[2]] = (mode[0], line)
+        # if it's not a sha1, but startswith ':'... then it's an index.
+        chunks = line.split(None, 4)
+        assert len(chunks) == 4, line
+        fname = chunks[3]
+        if chunks[2][0] == ':':
+          line = ' '.join(chunks[:2] + [blob_idx[int(chunks[2][1:])], fname])
+        files[fname] = (mode[0], line)
       else:
         raise AssertionError("got unknown file op: mode=%r, line:\n%r" % (mode[0], line))
       line = source.readline()
     d['files'] = files
     # Basic sanity check for the code above...
     assert set(fields).issuperset(d), d
+    d.setdefault('author', d.get('committer'))
+    assert d['author'] is not None
+    # Skank the timestamp out...
+    chunks = d['author'].rsplit(None, 1)
+    assert len(chunks) == 2 and chunks[1] == '+0000', d['author']
+    d['timestamp'] = long(chunks[0].rsplit(None, 1)[1])
     yield record(*[d.get(x) for x in fields])
     # Bleh... of course namedtuple doesn't make this easy.
     line = source.readline()
 
-def serialize_records(records, handle, target='refs/heads/master', progress=1000):
+def serialize_records(records, handle, target='refs/heads/master', progress=5000):
   write = handle.write
   write('reset %s\n' % target)
   total = len(records)
   for idx, record in enumerate(records, 1):
     if idx % progress == 0:
       write('progress %02.1f%%: %i of %i commits\n'
-        % ((100 * float(idx))//total, idx, total))
+        % ((100 * float(idx))/total, idx, total))
     write('commit %s\n' % target)
+    write('mark :%i\n' % idx)
     # fields = ('mark', 'author', 'committer', 'msg', 'files')
     for name, value in zip(fields, record):
       if name == 'files':
@@ -94,17 +107,32 @@ def serialize_records(records, handle, target='refs/heads/master', progress=1000
         write("%s %s\n" % (name, value))
       elif name == 'msg':
         write("data %i\n%s" % (len(value), value))
+      elif name == 'timestamp':
+        continue
       else:
         raise AssertionError("serialize is out of sync; don't know field %s" % name)
     write("\n")
 
+def deserialize_blob_map(source):
+  source = (x.strip().split() for x in source)
+  return dict((int(x[0].lstrip(':')), x[1]) for x in source)
+
 def main(argv):
-  source = open(argv[0], 'r') if argv else sys.stdin
-  records = list(deserialize_records(source))
+  records = []
+  source = argv if argv else sys.stdin
+  directories = [x.strip() for x in source]
+  for directory in directories:
+    tmp = os.path.join(directory, 'cvs2svn-tmp')
+    commits = os.path.join(tmp, 'git-dump.dat')
+    if not os.path.exists(commits):
+      sys.stderr.write("skipping %s; no commit data\n" % directory)
+      continue
+    blob_index = deserialize_blob_map(open(os.path.join(tmp, 'git-blob.idx')))
+    records.extend(deserialize_records(open(commits, 'r'), blob_index))
+  records.sort(key=operator.attrgetter('timestamp'))
+  #records = list(deserialize_records(source))
   serialize_records(records, sys.stdout)
   return 0
 
 if __name__ == '__main__':
-  if len(sys.argv) not in (1, 2):
-    raise SystemExit("args must be either none, or path to fast-export stream to read", code=1)
   sys.exit(main(sys.argv[1:]))
author	Brian Harring <ferringb@google.com>	2012-10-16 00:21:07 -0700
committer	Brian Harring <ferringb@google.com>	2012-10-16 13:28:49 -0700
commit	d740be6709ab7ac6a1b271430d650e4381f3f761 (patch)
tree	61f32e9f96a942891344f5f021ae4052a6ae59c6 /rewrite-commit-dump.py
parent	ongoing work (diff)
download	git-conversion-tools-d740be6709ab7ac6a1b271430d650e4381f3f761.tar.gz git-conversion-tools-d740be6709ab7ac6a1b271430d650e4381f3f761.tar.bz2 git-conversion-tools-d740be6709ab7ac6a1b271430d650e4381f3f761.zip