aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Brian Harring <ferringb@google.com> 2012-10-14 02:57:32 -0700
committer Brian Harring <ferringb@google.com> 2012-10-16 13:28:49 -0700
commit 09d374560a6dc0d364668264fe2640ffc6e193d6 (patch)
tree 476cf2e7fc7a7aba3eade27b595a58ed1c89c134
parent add usable script (diff)
download git-conversion-tools-09d374560a6dc0d364668264fe2640ffc6e193d6.tar.gz
git-conversion-tools-09d374560a6dc0d364668264fe2640ffc6e193d6.tar.bz2
git-conversion-tools-09d374560a6dc0d364668264fe2640ffc6e193d6.zip
Ongoing mangling
-rw-r--r-- README 43
-rw-r--r-- config 4
-rwxr-xr-x create-git.sh 45
-rwxr-xr-x process_directory.sh 20
-rwxr-xr-x rewrite-blob-data.py 17
-rwxr-xr-x rewrite-commit-dump.py 39
-rwxr-xr-x script.sh 13
7 files changed, 162 insertions, 19 deletions
diff --git a/README b/README
new file mode 100644
index 0000000..c56f068
--- /dev/null
+++ b/README
@@ -0,0 +1,43 @@
+Note: this is raw, and likes to dump things in cwd- namely, logs.
+
+To run it, first get yourself a copy of gentoo-x86 CVS; place that in
+cvs-repo in this directory. This can be a partial copy of CVS, or a full
+one; either way, it needs to conform to this layout:
+
+$(pwd)/cvs-repo/CVSROOT
+$(pwd)/cvs-repo/gentoo-x86/*
+
+From there, ./script.sh is your main point of entry; it'll process that,
+going parallel, using $(pwd)/output for temp space- it's suggested that
+be tmpfs (much like cvs-repo).
+
+As each category/directory/component is finished, a git repo is generated,
+some basic blob rewrites are done ($Header related). Two core directories
+will exist in each; cvs2svn-tmp (which holds the fast-import data w/in), and
+git; a recomposed bare git repository of that slice of gentoo-x86 history.
+
+Once that category/component is finished, it's moved into $(pwd)/final, and
+another component is started; script.sh currently runs with a parallelism of $(grep -c MHz /proc/cpuinfo).
+
+Upon finishing the cvs->git conversion, the content needs to be reintegrated.
+
+create-git.sh exists for this. It looks in $(pwd)/final, and creates the new
+repo in $(pwd)/git/work; this is a bare repo.
+
+Roughly, it does this by generating an empty repo, setting up alternates into each slice of
+history, setting up refs/heads/source/* space for each slice of history,
+then forcing a date-ordered fast-export- manipulating the resultant stream
+(stripping resets, rewriting the commit field to point to refs/heads/master, rewriting
+commit messages to convert some basic structured information into git footers), and
+spitting that out.
+
+It creates two dumps of intermediate data as it's going; export-stream-raw , and
+export-stream-rewritten; the first is git fast-export raw output, the second is
+the rewritten stream. Each is ~490MB (they're small due to the fact that
+since we're exporting/importing w/in the same repo, we don't have to send blobs
+through the stream- they can be directly referenced in the command stream).
+
+Now that that is done, we have a recomposed history in refs/heads/master.
+From there, we do pruning/gc'ing, and force a git repack -Adf.
+
+That repo is ready to go at that point.
diff --git a/config b/config
index 94c17d7..60cd351 100644
--- a/config
+++ b/config
@@ -171,7 +171,7 @@ ctx.sort_executable = r'sort'
# Change the following line to True if the conversion should only
# include the trunk of the repository (i.e., all branches and tags
# should be omitted from the conversion):
-ctx.trunk_only = False
+ctx.trunk_only = True
# How to convert CVS author names, log messages, and filenames to
# Unicode. The first argument to CVSTextDecoder is a list of encoders
@@ -539,7 +539,7 @@ run_options.set_project(
# The filesystem path to the part of the CVS repository (*not* a
# CVS working copy) that should be converted. This may be a
# subdirectory (i.e., a module) within a larger CVS repository.
- r'cvs-repo',
+ r'cvs-repo/gentoo-x86',
# A list of symbol transformations that can be used to rename
# symbols in this project.
diff --git a/create-git.sh b/create-git.sh
new file mode 100755
index 0000000..6389024
--- /dev/null
+++ b/create-git.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+proc_count=$(grep -c MHz /proc/cpuinfo)
+[ ${proc_count} -eq 0 ] && proc_count=1
+root="$(pwd)"
+mkdir -p git
+rm -rf git/* git/.git
+set -f
+mkdir -p git
+cd git
+git init --bare
+git config core.logAllRefUpdates false
+git config prune.expire now
+mkdir -p objects/info
+targets=( $(find ../final/ -maxdepth 1 -mindepth 1 -printf '../final/%P/\n' | \
+ xargs -n1 readlink -f | tee >(sed -e 's:$:/git/objects:' > objects/info/alternates) ) )
+for x in "${targets[@]}"; do
+ rev=$(git --git-dir $x/git rev-list -1 master 2> /dev/null)
+ [ -z "$rev" ] && { echo "no content: $x"; continue; }
+ x="refs/heads/source/$(basename $x)"
+ git update-ref "$x" $rev
+done
+
+echo "linearizing history, and rewriting messages..."
+
+time (
+ git fast-export --progress=1000 --all --reverse --date-order --no-data | \
+ tee ../export-stream-raw | \
+ "${root}/rewrite-commit-dump.py" | \
+ tee ../export-stream-rewritten | \
+ git fast-import
+) 2>&1 | tee git-creation.log
+
+echo "recomposed; repacking and breaking alternate linkage..."
+# Wipe the strong refs to the other repos...
+git ls-remote . refs/heads/source/'*' | awk '{print $2;}' | xargs -n1 git update-ref -d
+# Localize the content...
+time git repack -Adf --window=100 --depth=100
+# Wipe the alternates.
+rm objects/info/alternates
+echo "doing cleanup..."
+time git prune
+echo "doing basic sanity check"
+time git log -p refs/heads/master > /dev/null || echo "non zero exit code from git log run..."
+echo "Done"
diff --git a/process_directory.sh b/process_directory.sh
index 54c51cf..c9ff6e6 100755
--- a/process_directory.sh
+++ b/process_directory.sh
@@ -1,25 +1,31 @@
#!/bin/bash
+command='
+ sed -re "s/^\(paludis (0.1.*)\)$/Package-manager: Paludis \1/" \
+ -e "s/^\([Pp]ortage version: (.*)\)$/Package-manager: Portage \1/"'
f() {
set -x
- mkdir -p "${output}"/{git,cvs-repo/gentoo-x86/Attic}
+ mkdir -p "${output}"/{git{,-work},cvs-repo/gentoo-x86/Attic}
ln -s "${cvsroot}" "${output}/cvs-repo/CVSROOT"
ln -s "${root}/gentoo-x86/$1" "${output}/cvs-repo/gentoo-x86/$1"
#ln -s "${root}/gentoo-x86/Attic" "${output}/cvs-repo/gentoo-x86/Attic"
ln -s "$(pwd)/config" "${output}/config"
- cd "${output}"
+ # Note- this must be canonical path, else it screws up our $Header rewriting.
+ cd "$(readlink -f "${output}" )"
time cvs2git --options config -vv
cd git
git init --bare
- cat ../cvs2svn-tmp/git-{blob,dump}.dat | git fast-import
- rm -rf "${final}"
+ { "${base}/rewrite-blob-data.py" ../cvs2svn-tmp/git-blob.dat;
+ cat ../cvs2svn-tmp/git-dump.dat;
+ } | git fast-import
+ rm -rf "${final}" git-work
cd "$root"
mv "$output" "${final}"
- git --git-dir "${final}/git" log --pretty=tformat:"%at %H" > "${final}/git-hashes"
set +x
}
[ $# -ne 1 ] && { echo "need an argument..."; exit 1; }
+base="$(pwd)"
root="$(pwd)/cvs-repo"
cvsroot="${root}/CVSROOT"
repo="${root}/gentoo-x86"
@@ -29,6 +35,6 @@ mkdir -p "$(dirname "${final}")"
rm -rf "${output}"
mkdir -p "${output}"
-echo "processing ${1%,v} ${1}"
+echo "processing ${1%,v}" >&2
time f "$1" &> "${output}/"log || { echo "failed $1"; exit 1; }
-echo "processed $1"
+echo "processed $1" >&2
diff --git a/rewrite-blob-data.py b/rewrite-blob-data.py
new file mode 100755
index 0000000..55115a7
--- /dev/null
+++ b/rewrite-blob-data.py
@@ -0,0 +1,17 @@
+#!/usr/bin/python
+import functools
+import os
+import re
+import sys
+
+# $Header: /usr/local/ssd/gentoo-x86/output/.*/.*/cvs-repo/
+# $Header: /usr/local/ssd/gentoo-x86/output/app-accessibility/cvs-repo/gentoo-x86/app-accessibility/SphinxTrain/ChangeLog,v
+base = os.path.dirname(os.path.abspath(__file__))
+mangler = functools.partial(
+ re.compile(r"\$Header: %s/output/.*/cvs-repo/" % base).sub,
+ r"$Header: /var/cvsroot/")
+
+write = sys.stdout.write
+source = open(sys.argv[1]) if len(sys.argv) > 1 else sys.stdin
+for x in source:
+ write(mangler(x))
diff --git a/rewrite-commit-dump.py b/rewrite-commit-dump.py
new file mode 100755
index 0000000..4784cb5
--- /dev/null
+++ b/rewrite-commit-dump.py
@@ -0,0 +1,39 @@
+#!/usr/bin/python
+import functools
+import re
+import sys
+
+mangler = []
+mangler.append(functools.partial(
+ re.compile(r"^\(paludis (0.1.*)\)$", re.M|re.I).sub,
+ r"Package-Manager: paludis-\1/"))
+mangler.append(functools.partial(
+ re.compile(r"^\(portage version: (.*)\)$", re.M|re.I).sub,
+ r"Package-Manager: portage-\1"))
+
+write = sys.stdout.write
+source = open(sys.argv[1]) if len(sys.argv) > 1 else sys.stdin
+write('reset refs/heads/master\n')
+while True:
+ x = source.readline()
+ if not x:
+ break
+ chunked = x.split()
+ if not chunked:
+ write(x)
+ continue
+ elif chunked[0] in ('reset', 'from'):
+ continue
+ elif chunked[0] == 'commit':
+ write('commit refs/heads/master\n')
+ continue
+ elif chunked[0] != 'data':
+ write(x)
+ continue
+ assert len(chunked) == 2
+ size = int(chunked[1])
+ data = source.read(size)
+ assert len(data) == size
+ for func in mangler:
+ data = func(data)
+ write("data %i\n%s" % (len(data), data))
diff --git a/script.sh b/script.sh
index 12c2032..e5a4e35 100755
--- a/script.sh
+++ b/script.sh
@@ -3,20 +3,13 @@
proc_count=$(grep -c MHz /proc/cpuinfo)
[ $proc_count -eq 0 ] && proc_count=1
-rm -rf git
-mkdir git
+rm -rf git/* git/.git final/*
+mkdir git -p
# Prioritize the larger categories first; they typically will have
# the most revs, thus start them first.
time { \
find cvs-repo/gentoo-x86 -maxdepth 1 -mindepth 1 -printf '%P\n' | \
xargs -n1 -I{} -- du -cs "cvs-repo/gentoo-x86/{}" | grep -v 'total$' | \
sort -gr | awk '{print $2;}' | xargs -n1 basename | \
- xargs -n1 -P${proc_count} ./process_directory.sh | \
- {
- cd git;
- git init &> /dev/null
- while read l; do
- git fetch "$(readlink -f "../final/$l/git")" && git merge FETCH_HEAD -m "blah" -q
- done
- }
+ xargs -n1 -P${proc_count} ./process_directory.sh
}