#!/usr/bin/env python3
# pylint: disable=line-too-long,missing-docstring,invalid-name
# vim:noet sts=4 ts=4
# Copyright 2016 Robin Johnson
# Implementation of https://wiki.gentoo.org/wiki/User:Robbat2:ChangeLog-Generation
import argparse
import collections
import io
import itertools
import logging
import multiprocessing
import os
import re
import sqlite3
import subprocess
import sys
import tempfile
import textwrap
import time

import portage
import portage.exception
import portage.util
from portage import os, _encodings, _unicode_encode, _unicode_decode
from portage.util import writemsg_level
from _special_filename import _special_filename

portage_settings = None
portage_portdbapi = None
logger = None
args = None
dir_contents = collections.defaultdict(list)

_repo_path = '/usr/portage/.git'
_work_tree = '--git-dir=%s' % _repo_path

EXCLUDE_COMMITS = set([
	'56bd759df1d0c750a065b8c845e93d5dfa6b549d',
])

#git_log_format = "--format='COMMIT%m%n%H %ct %cN <%cE>%n%BFILES%m'"
# Field separator:
# "\0\0FIELDNAME\0DATA"
def git_log_field(fieldname, fieldspec):
	return '%x02%x01' + fieldname + '%x01' + fieldspec

git_log_fields = [
	'%x01%x01COMMIT%x01',
	#git_log_field('META', '%H %ct %cN <%cE>'),
	git_log_field('H', '%H'),    # Long hash
	git_log_field('h', '%h'),    # Short hash, used for changelog print
	git_log_field('P', '%P'),    # Parents, used for merge handling
	git_log_field('ct', '%ct'),  # Commit time
	git_log_field('cN', '%cN'),  # Committer name
	git_log_field('cE', '%cE'),  # Committer email
	git_log_field('at', '%at'),  # Author time
	git_log_field('aN', '%aN'),  # Author name
	git_log_field('aE', '%aE'),  # Author email
	git_log_field('B', '%B'),    # Body
	git_log_field('N', '%N'),    # git-notes
	git_log_field('RAWFILES', ''),  # Must be last
]
git_log_format = ''.join(git_log_fields)
git_log_command = [
	'git', _work_tree, 'log',
	'--format=tformat:' + git_log_format,
	'--name-status',  # Used for changelog entries
	'--no-renames',   # Report rename as remove+add
	'--root',         # From top-level tree
	'-z',             # Use NUL to separate output elements
	'--reverse',      # Oldest-first
	'--color=never',  # No ANSI here.
	'--topo-order',   # TODO: explain why
]
#git_log_command += ['-n', '100']

# TODO: convert this into a stream
# http://stackoverflow.com/questions/18809524/how-to-read-a-big-binary-file-and-split-its-content-by-some-marker
def get_commits_buffer(last_commit='HEAD', first_commit=None, opts=argparse.Namespace()):
	buf = None
	if opts.read_git_log_data is not None:
		buf = opts.read_git_log_data.read().decode('utf-8')
		opts.read_git_log_data.close()
	else:
		# TODO: include first/last commit in command.
		if opts.debug > 0:
			#cmdstr = ' '.join(["'{}'".format(s) for s in git_log_command])
			cmdstr = ' '.join(["'"+s+"'" for s in git_log_command])
			print('Command', cmdstr, flush=True)
		buf = subprocess.check_output(git_log_command).decode('utf-8')
		if opts.write_git_log_data is not None:
			opts.write_git_log_data.write(buf.encode('utf-8'))
			opts.write_git_log_data.close()
	return buf
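
# Illustration (not literal output) of one serialized commit record, as built
# by git_log_format above and consumed by CommitGenerator/get_commit_fields()
# below; ^A and ^B stand for the \x01 and \x02 separator bytes:
#   ^A^ACOMMIT^A
#   ^B^AH^A<full sha>^B^Ah^A<short sha>^B^AP^A<parent shas> ... ^B^AB^A<body>
#   ^B^ARAWFILES^A<newline, NUL, then NUL-separated status/path pairs>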

# TODO: convert to using a stream instead of a buffer.
# http://stackoverflow.com/questions/18809524/how-to-read-a-big-binary-file-and-split-its-content-by-some-marker
class CommitGenerator(object):
	def __init__(self, buf, opts=argparse.Namespace()):
		self.buf = buf
		self.regex = re.compile(r'\x01\x01COMMIT\x01', flags=re.MULTILINE)
		self.buflen = len(self.buf)
		self.pos = 0
		self.regex_iter = self.regex.finditer(self.buf)

	def __iter__(self):
		return self

	def __next__(self):
		return self.next()

	def next(self):
		result = None
		try:
			commit = next(self.regex_iter)
			span = commit.span()
			result = self.buf[self.pos:span[0]]
			self.pos = span[1]
		except StopIteration:
			if self.pos >= self.buflen:
				raise StopIteration
			result = self.buf[self.pos:]
			self.pos = self.buflen
		return result

def get_commit_fields(buf):
	regex = re.compile(r'\x02\x01([-A-Z0-9a-z._]+)\x01', flags=re.MULTILINE)
	pos = 0
	last_match = None
	for match in regex.finditer(buf):
		if last_match:
			key = last_match.group(1)
			val = buf[last_match.span()[1]:match.span()[0]]
			val = val.rstrip('\n')
			yield (key, val)
		last_match = match
	if last_match:
		key = last_match.group(1)
		val = buf[last_match.span()[1]:]
		val = val.rstrip('\n')
		yield (key, val)

def _path_to_cp_file_profiles(p):
	# TODO: implement this properly, it's been mostly ad-hoc, eg:
	# profiles/default/bsd/ChangeLog
	# profiles/default/bsd/fbsd/amd64/9.1/clang/ChangeLog
	# So that default/bsd/ChangeLog contains everything EXCEPT the
	# profiles/default/bsd/fbsd/amd64/9.1/clang stuff.
	return tuple(p.split('/', 1))

# Convert a repo commit file path to where the changelog should be and what it
# should contain.
#
# Return a tuple of:
# - directory for changelog output
# - files relative to changelog directory
#
# @param p path relative to base of repo
# @return (dir, file)
def path_to_dir_file(p):
	# Profiles is the longest special case:
	# mixed ChangeLogs at multiple depths.
	if p.startswith('profiles/'):
		return _path_to_cp_file_profiles(p)
	# Force metadata & scripts to top-level
	for pfx in ['metadata', 'scripts']:
		if p.startswith(pfx+'/'):
			return ('', p)
	# Eclass & licenses get their own changelogs.
	for pfx in ['eclass', 'licenses']:
		if p.startswith(pfx+'/'):
			p2 = p[len(pfx)+1:]
			return (pfx, p2)
	# Everything else SHOULD be one of the following:
	# skel.ebuild
	# skel.metadata.xml
	# header.txt
	# ${CATEGORY}/metadata.xml
	# ${CATEGORY}/${PN}/Manifest
	# ${CATEGORY}/${PN}/metadata.xml
	# ${CATEGORY}/${PN}/${PF}.ebuild
	# ${CATEGORY}/${PN}/files/.*  # may be multiple directories
	p_split = p.split('/', 2)
	#print('split', p_split)
	if len(p_split) == 1:
		# Toplevel:
		return ('', p)
	elif len(p_split) == 2:
		# Category-wide goes to top-level
		# TODO: add other Category-wide files here
		if p.endswith('metadata.xml'):
			return ('', p)
		# Other stuff stays
		return tuple(p_split)
	else:
		# Package
		cp = '/'.join(p_split[0:2])
		return (cp, p_split[2])

# Convert the file portion of git log --name-status -z to changelog style.
#
# @param rawfiles_text null-separated data about file changes in a single commit.
# @param commit_hash commit hash, used only in error messages (may be None).
# @return dict of file modifications in changelog style, one key per changelog directory.
def process_git_rawfiles_to_changelog(rawfiles_text, commit_hash=None):
	changed_raw = rawfiles_text[2:].split('\x00')
	if changed_raw[-1] == '':
		del changed_raw[-1]
	changed = {}
	# TODO: is this long enough to be worth splitting into parallel function call?
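	# Example of what changed_raw holds at this point: alternating
	# `git log --name-status` letters and paths (hypothetical package), e.g.
	#   ['A', 'app-misc/foo/foo-1.0.ebuild', 'M', 'app-misc/foo/Manifest']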
	for f in zip(*2*[iter(changed_raw)]):
		_ = path_to_dir_file(f[1])
		if len(_) > 2:
			print('tuple', _)
		(dirname, filename) = _
		if dirname not in changed:
			changed[dirname] = []
		if (f[0] == '' or f[0] is None or f[1] == '' or f[1] is None):
			writemsg_level(
				"ERROR: unexpected git file status for %s: %s\n" % (commit_hash, f,),
				level=logging.ERROR, noiselevel=-1)
		elif filename == 'Manifest':
			pass  # XXX: remanifest commits?
		elif filename.startswith('ChangeLog'):
			pass
		elif f[0].startswith('A'):
			changed[dirname].append(_special_filename("+", filename))
		elif f[0].startswith('D'):
			changed[dirname].append(_special_filename("-", filename))
		elif f[0].startswith('M'):
			changed[dirname].append(_special_filename("", filename))
		else:
			writemsg_level(
				"ERROR: unexpected git file status for %s: %s\n" % (commit_hash, f,),
				level=logging.ERROR, noiselevel=-1)
	for k, v in changed.items():
		changed[k] = [str(x) for x in sorted(v)]
	return changed

def process_git_commit_text_to_struct(commit_text):
	if not commit_text:
		return None
	fields = dict(get_commit_fields(commit_text))
	if 'RAWFILES' in fields:
		fields['FILES'] = process_git_rawfiles_to_changelog(fields['RAWFILES'], fields.get('H'))
		del fields['RAWFILES']
	#(ts, author) = cinfo[0].split(' ', 1)
	#date = time.strftime('%d %b %Y', time.gmtime(float(ts)))
	return fields
	#print('.', end="", flush=True)
	#return (commit.split(b"\n")[0], )
	#return 1

def changelog_header(cp, last_commit):
	# TODO: consider adding some identifier of when it was updated?
	# Maybe git commit ID?
	return textwrap.dedent('''\
		# ChangeLog for %s
		# Copyright 1999-%s Gentoo Foundation; Distributed under the GPL v2
		# (auto-generated from git log)

		''' % (cp, time.strftime('%Y')))

def process_commit_files(cp, files):
	# TODO: write this.
	return files[cp]

def process_commit_body(cp, commit):
	body = commit['B'].split('\n')
	# strip 'cat/pn: ', '[cat/pn] ', and similar
	body[0] = re.sub(r'^\W*' + re.escape(cp) + r'\W+', '', body[0])
	# TODO: How should we handle:
	# - Signed-off-by for non-author/non-committer
	# - Various bug tags: X-Gentoo-Bug, X-Gentoo-Bug-URL, etc.
	# - Acked-by:
	# - Reviewed-by:
	# - (cherry picked from commit ...)
	strip_trailing_lines = [
		r'^git-svn-id:\s+.*',
		r'^\s*\(Signed Manifest commit\)$',
		r'^\s*\(Unsigned Manifest commit\)$',
		r'^\(Portage version:\s+.*\)$',
		r'^Package-Manager:\s+.*',
		r'^RepoMan-Options:\s+.*',
		r'^$',
	]
	# If the author is the committer, then we don't need the signed-off-by, as
	# it will already be included.
	if commit['cE'] == commit['aE']:
		strip_trailing_lines.append(r'^Signed-off-by:\s+.*' + re.escape(commit['cE']) + r'.*')
	bugs = []
	while len(body) > 0:
		changed_anything = False
		for pat in strip_trailing_lines:
			if re.search(pat, body[-1], flags=re.IGNORECASE):
				changed_anything = True
				body[-1] = re.sub(pat, '', body[-1], flags=re.IGNORECASE)
			if len(body[-1]) == 0:
				del body[-1]
			if len(body) == 0:
				break
		if not changed_anything:
			break
	return body

def write_package_changelog(cp, cp_commits, options={}):
	result = 'FAILED'
	# Open wrapper
	_wrapper = textwrap.TextWrapper(
		width=78,
		initial_indent='  ',
		subsequent_indent='  ',
	)
	# Generate commit entries; cp_commits arrives oldest-first (git log
	# --reverse) and MUST stay in that order for package-deletion detection,
	# so the requested output ordering is applied at write time instead.
	package_deleted = False
	changelog_entries = []
	package_files = dict()
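	# Shape of a rendered entry (hypothetical package/committer; the short
	# hash after the date is an addition over the old CVS behavior):
	#   *foo-1.0 (09 Aug 2015)
	#
	#     09 Aug 2015 abc1234; Larry Ebuilder <larry@example.org> +foo-1.0.ebuild:
	#     Initial import.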
	# This MUST go oldest to newest, so that we can detect package deletion.
	for c in cp_commits:
		sio = io.StringIO()
		# TODO: detect package deletion.
		changed = process_commit_files(cp, c['FILES'])
		author = '{} <{}>'.format(c['aN'], c['aE'])
		committer = '{} <{}>'.format(c['cN'], c['cE'])
		date = time.strftime('%d %b %Y', time.gmtime(float(c['ct'])))
		for fn in changed:
			if fn.startswith('+'):
				fnb = fn[1:]
				if fnb not in package_files:
					package_files[fnb] = []
				package_files[fnb].append(c['H'])
				#raise KeyError('{}: {} already tracked, cannot add, commit={} vs {}!'.format(cp, fnb, c['H'], package_files[fnb]))
			elif fn.startswith('-'):
				fnb = fn[1:]
				if fnb in package_files:
					del package_files[fnb]
				else:
					pass
					#raise KeyError('{}: {} not already tracked, cannot delete, commit={}!'.format(cp, fnb, c['H']))
			else:
				if fn not in package_files:
					#raise KeyError('{}: {} not already tracked, cannot modify, commit={}!'.format(cp, fn, c['H']))
					package_files[fn] = []
				package_files[fn].append(c['H'])
		# If all files were deleted, then the package was removed; a later
		# commit that re-adds files clears the flag again.
		package_deleted = (len(package_files) == 0)
		# If we tried to exclude at a higher level, we would run the risk
		# that an excluded commit deletes the package, and then we would
		# write an orphaned ChangeLog.
		if c['H'] in EXCLUDE_COMMITS:
			continue
		# Write new version bumps:
		# *saaj-api-1.3-r3 (09 Aug 2015)
		wroteheader = False
		for fn in changed:
			if fn.startswith('+') and fn.endswith('.ebuild'):
				sio.write('*%s (%s)\n' % (fn[1:-7], date))
				wroteheader = True
		if wroteheader:
			sio.write('\n')
		# Add the short hash to each change for tracing.
		# NOTE: this is a change from CVS behavior.
		date += ' ' + c['h']
		# DO NOT break on hyphens in filenames.
		_wrapper.break_on_hyphens = False
		sio.write(_wrapper.fill(
			'%s; %s %s:' % (date, committer, ', '.join(changed))))
		# DO break on hyphens in text.
		_wrapper.break_on_hyphens = True
		body = process_commit_body(cp, c)
		# If the author was not the committer, credit the author.
		# NOTE: this is a change from CVS behavior.
		if (author != committer) and body:
			body[0] = '(author {}) '.format(author) + body[0]
		sio.write('\n%s\n\n' % '\n'.join(_wrapper.fill(x) for x in body))
		changelog_entries.append(sio.getvalue())

	# Build filenames
	tempfile_prefix = '.tmp-ChangeLog:{}:'.format(cp.replace('/', ':'))
	destdir = os.path.join(options['destdir'], cp)
	changelog_file = os.path.join(destdir, 'ChangeLog-{}'.format(time.strftime('%Y')))
	changelog_symlink = os.path.join(destdir, 'ChangeLog')
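	# Resulting layout, e.g. for cp='app-misc/foo' (hypothetical package):
	#   <destdir>/app-misc/foo/ChangeLog-<YYYY>  (regular file, current year)
	#   <destdir>/app-misc/foo/ChangeLog         (symlink to the above)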
	# If the package was deleted and not brought back, we should NOT write
	# a changelog to disk.
	if package_deleted:
		if os.path.exists(changelog_file):
			os.unlink(changelog_file)
		if os.path.exists(changelog_symlink):
			os.unlink(changelog_symlink)
		result = 'DELETED'
		return '{} {}'.format(cp, result)

	# Open file
	if len(destdir) > 0:
		os.makedirs(destdir, exist_ok=True)
	with tempfile.NamedTemporaryFile(
			buffering=64*1024,
			prefix=tempfile_prefix,
			mode='w',
			encoding=_encodings['repo.content'],
			dir=destdir,
			delete=False,
			) as output:
		# Write header
		output.write(changelog_header(cp, 'TODO-COMMIT'))
		# Entries were generated oldest-first; flip at write time if
		# newest-first output was requested.
		if options['newest-first']:
			changelog_entries = reversed(changelog_entries)
		for s in changelog_entries:
			output.write(s)

	# Close file
	if isinstance(output, tempfile.SpooledTemporaryFile):
		output.rollover()  # Force to disk
	#print('cp', cp, 'output.name', output.name, file=sys.stderr)
	# NamedTemporaryFile yields an absolute output.name, so the join below
	# effectively returns output.name unchanged.
	src = os.path.join(cp, output.name)
	if os.path.exists(changelog_file):
		os.unlink(changelog_file)
	os.replace(src, changelog_file)
	if os.path.lexists(changelog_symlink) or os.path.exists(changelog_symlink):
		os.unlink(changelog_symlink)
	os.symlink(os.path.basename(changelog_file), changelog_symlink)
	result = 'GENERATED'
	# Be really explicit about it
	output.close()
	return '{} {}'.format(cp, result)

options = {
	'destdir': 'changelog-gen/',
	'newest-first': False,
}

def starmap_func_write_package_changelog(cp, cp_commits):
	return write_package_changelog(cp, cp_commits, options=options)
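
# Sequential sketch of what the opts.starmap dispatch in main() amounts to
# (the multiprocessing.Pool variant makes the same calls in worker processes):
#   for cp, cp_commits in changed_packages.items():
#       write_package_changelog(cp, cp_commits, options=options)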

def main():
	parallel = True
	parser = argparse.ArgumentParser(description='Generate ChangeLog for repo/gentoo.git (or other similarly structured trees)')
	parser.add_argument('--destdir', metavar='DIR', type=str, required=True,
		help='Destination directory for output')
	parser.add_argument('--jobs', metavar='N', type=str, default='max',
		help='Number of parallel workers to use, "max" -> all cpus [%(default)s]')
	parser.add_argument('--progress', action='count', default=0,
		help='Display progress output (number of commits & packages processed)')
	parser.add_argument('--verbose', action='count', default=0,
		help='Display verbose output')
	parser.add_argument('--debug', action='count', default=0,
		help='Display debug output')
	parser.add_argument('--write-git-log-data', type=argparse.FileType('wb'),
		help='Write git-log data to specified file')
	parser.add_argument('--read-git-log-data', type=argparse.FileType('rb'),
		help='Read git-log data from specified file (INSTEAD of running git-log)')
	g1 = parser.add_mutually_exclusive_group()
	# default=None so "not specified" is detectable below; a plain store_true
	# would default to False and hide the tri-state.
	g1.add_argument('--delete-git-log-data', dest='delete_git_log_data', action='store_true', default=None,
		help='Delete git-log data file after completion (default yes if not reading)')
	g1.add_argument('--no-delete-git-log-data', dest='delete_git_log_data', action='store_false',
		help='Do not delete git-log data file after completion')
	parser.add_argument('--max-count', type=int, default=None,
		help=argparse.SUPPRESS)  # Max number of commits to request (only for debugging)
	opts = parser.parse_args()
	if opts.jobs == 'max':
		opts.jobs = multiprocessing.cpu_count()
	elif re.match(r'^\d+$', opts.jobs) and int(opts.jobs) > 0:
		opts.jobs = int(opts.jobs)
	else:
		parser.error('Jobs must be "max" or a positive integer.')
	# Propagate --destdir to the module-level options dict that the
	# (possibly forked) starmap workers read.
	options['destdir'] = opts.destdir
	# if --delete-git-log-data is not set
	#   set to true if --read-git-log-data is NOT set
	if opts.delete_git_log_data is None:
		opts.delete_git_log_data = (opts.read_git_log_data is None)
	if opts.write_git_log_data is None:
		tmpopts = {'prefix': 'egenchangelog2-git-log-', 'suffix': '.bin'}
		if opts.delete_git_log_data is True:
			opts.write_git_log_data = tempfile.SpooledTemporaryFile(max_size=(128 * 2 << 20), **tmpopts)
		else:
			opts.write_git_log_data = tempfile.NamedTemporaryFile(delete=False, **tmpopts)

	if opts.jobs > 1:
		# WARNING: all input functions to these MUST be picklable!
		# Lambdas or local functions will NOT work.
		opts.p = multiprocessing.Pool(processes=opts.jobs)
		opts.imap_ordered = opts.p.imap
		opts.imap_unordered = opts.p.imap_unordered
		opts.starmap = opts.p.starmap
	else:
		opts.p = None
		opts.imap_ordered = map
		opts.imap_unordered = map
		opts.starmap = itertools.starmap
	if opts.debug > 0:
		print(opts)

	if opts.progress > 0 or opts.verbose > 0:
		print("Querying git-log.", flush=True)
	buf = get_commits_buffer(opts=opts)
	if opts.progress > 0 or opts.verbose > 0:
		print("git-log gave us {} bytes".format(len(buf)), flush=True)

	if opts.progress > 0 or opts.verbose > 0:
		print("Evaluating commits:", flush=True)
	commits1 = CommitGenerator(buf, opts=opts)
	#for n, c in enumerate(commits1):
	#	print(n, c)
	#return
	slicer = itertools.chain(commits1)  # Force lazy eval
	commits2 = opts.imap_ordered(process_git_commit_text_to_struct, slicer)
	changed_packages = dict()
	for n, c in enumerate(commits2):
		# Skip bad output (should NOT happen)
		if not c or c.get('H', None) is None:
			# TODO: error output
			continue
		# BAD: Skip excluded commits
		# BAD: we must do it at a lower level, look for the other usage of
		# EXCLUDE_COMMITS for the detailed comment.
		#if c['H'] in EXCLUDE_COMMITS:
		#	continue
		if opts.progress > 0:
			print(n, 'H={}'.format(c['H']))
		# TODO: Stash commits to SQLite
		for cp in c['FILES'].keys():
			if cp not in changed_packages:
				changed_packages[cp] = []
			changed_packages[cp].append(c)
	if opts.progress > 0 or opts.verbose > 0:
		print("Done {} commits. Saw {} total packages.".format(n, len(changed_packages)), flush=True)

	if opts.progress > 0 or opts.verbose > 0:
		print("Writing ChangeLogs:", flush=True)
	commits3 = opts.starmap(starmap_func_write_package_changelog, changed_packages.items())
	# This must be done because otherwise only lazy evaluation happens.
	for n, cp in enumerate(commits3):
		if opts.progress > 0:
			print(n, cp)
	if opts.progress > 0 or opts.verbose > 0:
		print("Done writing ChangeLogs.", flush=True)

if __name__ == '__main__':
	main()
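
# Example invocation (hypothetical script name; --destdir is required, and the
# repository location is taken from _repo_path at the top of this file):
#   python3 gen-changelogs.py --destdir changelog-gen/ --jobs max --progress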