summaryrefslogtreecommitdiff
path: root/pym
diff options
context:
space:
mode:
authorZac Medico <zmedico@gentoo.org>2008-06-09 13:55:06 +0000
committerZac Medico <zmedico@gentoo.org>2008-06-09 13:55:06 +0000
commit0c8fc90cb2f0101f60a78953668b391e44d3cdca (patch)
treee1420607f8c3ac05f28f7bb05714aa825566658f /pym
parentWhen inside fakeroot, directories with portage's gid appear (diff)
downloadportage-idfetch-0c8fc90cb2f0101f60a78953668b391e44d3cdca.tar.gz
portage-idfetch-0c8fc90cb2f0101f60a78953668b391e44d3cdca.tar.bz2
portage-idfetch-0c8fc90cb2f0101f60a78953668b391e44d3cdca.zip
Add CONTENTS indexing support for optimization of owner lookups. The
vardbapi cache maintains a hash table (inside vdb_metadata.pickle) that serves to index package contents by mapping the basename of file to a list of possible packages that own it. This is used to optimize owner lookups by narrowing the search down to a smaller number of packages. It increases the size of vdb_metadata.pickle by approximately 30% and it's used in the following cases: * When an unexpected file collision occurs (whether or not collision-protect is enabled) * `emerge <filename>` * `portageq owners` The svn path=/main/trunk/; revision=10609
Diffstat (limited to 'pym')
-rw-r--r--pym/_emerge/__init__.py15
-rw-r--r--pym/portage/dbapi/vartree.py293
2 files changed, 249 insertions, 59 deletions
diff --git a/pym/_emerge/__init__.py b/pym/_emerge/__init__.py
index 520a8610..68f976cb 100644
--- a/pym/_emerge/__init__.py
+++ b/pym/_emerge/__init__.py
@@ -2560,6 +2560,7 @@ class depgraph(object):
myroot = self.target_root
dbs = self._filtered_trees[myroot]["dbs"]
vardb = self.trees[myroot]["vartree"].dbapi
+ real_vardb = self._trees_orig[myroot]["vartree"].dbapi
portdb = self.trees[myroot]["porttree"].dbapi
bindb = self.trees[myroot]["bintree"].dbapi
pkgsettings = self.pkgsettings[myroot]
@@ -2638,16 +2639,12 @@ class depgraph(object):
" $ROOT.\n") % x, noiselevel=-1)
return 0, []
relative_path = x[len(myroot):]
- vartree = self._trees_orig[myroot]["vartree"]
owner_cpv = None
- for cpv in vardb.cpv_all():
- self.spinner.update()
- cat, pf = portage.catsplit(cpv)
- if portage.dblink(cat, pf, myroot,
- pkgsettings, vartree=vartree).isowner(
- relative_path, myroot):
- owner_cpv = cpv
- break
+ for pkg, relative_path in \
+ real_vardb._owners.iter_owners([relative_path]):
+ owner_cpv = pkg.mycpv
+ break
+
if owner_cpv is None:
portage.writemsg(("\n\n!!! '%s' is not claimed " + \
"by any package.\n") % x, noiselevel=-1)
diff --git a/pym/portage/dbapi/vartree.py b/pym/portage/dbapi/vartree.py
index f2593be6..fd28a670 100644
--- a/pym/portage/dbapi/vartree.py
+++ b/pym/portage/dbapi/vartree.py
@@ -238,6 +238,9 @@ class vardbapi(dbapi):
_excluded_dirs = re.compile(r'^(\..*|-MERGING-.*|' + \
"|".join(_excluded_dirs) + r')$')
+ _aux_cache_version = "1"
+ _owners_cache_version = "1"
+
# Number of uncached packages to trigger cache update, since
# it's wasteful to update it for every vdb change.
_aux_cache_threshold = 5
@@ -275,8 +278,7 @@ class vardbapi(dbapi):
"EAPI", "HOMEPAGE", "IUSE", "KEYWORDS",
"LICENSE", "PDEPEND", "PROVIDE", "RDEPEND",
"repository", "RESTRICT" , "SLOT", "USE"])
- self._aux_cache = None
- self._aux_cache_version = "1"
+ self._aux_cache_obj = None
self._aux_cache_filename = os.path.join(self.root,
CACHE_PATH.lstrip(os.path.sep), "vdb_metadata.pickle")
self._counter_path = os.path.join(root,
@@ -290,6 +292,7 @@ class vardbapi(dbapi):
self.plib_registry = None
self.linkmap = LinkageMap(self)
+ self._owners = self._owners_db(self)
def getpath(self, mykey, filename=None):
rValue = os.path.join(self.root, VDB_PATH, mykey)
@@ -562,6 +565,7 @@ class vardbapi(dbapi):
if self._aux_cache is not None and \
len(self._aux_cache["modified"]) >= self._aux_cache_threshold and \
secpass >= 2:
+ self._owners.populate() # index any unindexed contents
valid_nodes = set(self.cpv_all())
for cpv in self._aux_cache["packages"].keys():
if cpv not in valid_nodes:
@@ -577,6 +581,54 @@ class vardbapi(dbapi):
pass
self._aux_cache["modified"] = set()
+ @property
+ def _aux_cache(self):
+ if self._aux_cache_obj is None:
+ self._aux_cache_init()
+ return self._aux_cache_obj
+
+ def _aux_cache_init(self):
+ try:
+ f = open(self._aux_cache_filename)
+ mypickle = cPickle.Unpickler(f)
+ mypickle.find_global = None
+ aux_cache = mypickle.load()
+ f.close()
+ del f
+ except (IOError, OSError, EOFError, cPickle.UnpicklingError), e:
+ if isinstance(e, cPickle.UnpicklingError):
+ writemsg("!!! Error loading '%s': %s\n" % \
+ (self._aux_cache_filename, str(e)), noiselevel=-1)
+ del e
+
+ if not aux_cache or \
+ not isinstance(aux_cache, dict) or \
+ aux_cache.get("version") != self._aux_cache_version or \
+ not aux_cache.get("packages"):
+ aux_cache = {"version": self._aux_cache_version}
+ aux_cache["packages"] = {}
+
+ owners = aux_cache.get("owners")
+ if owners is not None:
+ if not isinstance(owners, dict):
+ owners = None
+ elif "version" not in owners:
+ owners = None
+ elif owners["version"] != self._owners_cache_version:
+ owners = None
+ elif "base_names" not in owners:
+ owners = None
+
+ if owners is None:
+ owners = {
+ "base_names" : {},
+ "version" : self._owners_cache_version
+ }
+ aux_cache["owners"] = owners
+
+ aux_cache["modified"] = set()
+ self._aux_cache_obj = aux_cache
+
def aux_get(self, mycpv, wants):
"""This automatically caches selected keys that are frequently needed
by emerge for dependency calculations. The cached metadata is
@@ -601,26 +653,6 @@ class vardbapi(dbapi):
cache_these = set(self._aux_cache_keys)
cache_these.update(cache_these_wants)
- if self._aux_cache is None:
- try:
- f = open(self._aux_cache_filename)
- mypickle = cPickle.Unpickler(f)
- mypickle.find_global = None
- self._aux_cache = mypickle.load()
- f.close()
- del f
- except (IOError, OSError, EOFError, cPickle.UnpicklingError), e:
- if isinstance(e, cPickle.UnpicklingError):
- writemsg("!!! Error loading '%s': %s\n" % \
- (self._aux_cache_filename, str(e)), noiselevel=-1)
- del e
- if not self._aux_cache or \
- not isinstance(self._aux_cache, dict) or \
- self._aux_cache.get("version") != self._aux_cache_version or \
- not self._aux_cache.get("packages"):
- self._aux_cache = {"version": self._aux_cache_version}
- self._aux_cache["packages"] = {}
- self._aux_cache["modified"] = set()
mydir = self.getpath(mycpv)
mydir_stat = None
try:
@@ -801,6 +833,173 @@ class vardbapi(dbapi):
write_atomic(self._counter_path, str(counter))
return counter
+ def _dblink(self, cpv):
+ category, pf = catsplit(cpv)
+ return dblink(category, pf, self.root,
+ self.settings, vartree=self.vartree)
+
+ class _owners_cache(object):
+ """
+ This class maintains an hash table that serves to index package
+ contents by mapping the basename of file to a list of possible
+ packages that own it. This is used to optimize owner lookups
+ by narrowing the search down to a smaller number of packages.
+ """
+ try:
+ from hashlib import md5 as _new_hash
+ except ImportError:
+ from md5 import new as _new_hash
+
+ _hash_bits = 16
+ _hex_chars = _hash_bits / 4
+
+ def __init__(self, vardb):
+ self._vardb = vardb
+
+ def add(self, cpv):
+ root_len = len(self._vardb.root)
+ contents = self._vardb._dblink(cpv).getcontents()
+ pkg_hash = self._hash_pkg(cpv)
+ if not contents:
+ # Empty path is a code used to represent empty contents.
+ self._add_path("", pkg_hash)
+ for x in contents:
+ relative_path = x[root_len:]
+ self._add_path(x, pkg_hash)
+ self._vardb._aux_cache["modified"].add(cpv)
+
+ def _add_path(self, path, pkg_hash):
+ """
+ Empty path is a code that represents empty contents.
+ """
+ if path:
+ name = os.path.basename(path.rstrip(os.path.sep))
+ if not name:
+ return
+ else:
+ name = path
+ name_hash = self._hash_str(name)
+ base_names = self._vardb._aux_cache["owners"]["base_names"]
+ pkgs = base_names.get(name_hash)
+ if pkgs is None:
+ pkgs = {}
+ base_names[name_hash] = pkgs
+ pkgs[pkg_hash] = None
+
+ def _hash_str(self, s):
+ h = self._new_hash()
+ h.update(s)
+ h = h.hexdigest()
+ h = h[-self._hex_chars:]
+ h = int(h, 16)
+ return h
+
+ def _hash_pkg(self, cpv):
+ counter, mtime = self._vardb.aux_get(
+ cpv, ["COUNTER", "_mtime_"])
+ try:
+ counter = int(counter)
+ except ValueError:
+ counter = 0
+ return (cpv, counter, mtime)
+
+ class _owners_db(object):
+
+ def __init__(self, vardb):
+ self._vardb = vardb
+
+ def populate(self):
+ self._populate()
+
+ def _populate(self):
+ owners_cache = vardbapi._owners_cache(self._vardb)
+ cached_hashes = set()
+ base_names = self._vardb._aux_cache["owners"]["base_names"]
+
+ # Take inventory of all cached package hashes.
+ for hash_values in base_names.itervalues():
+ cached_hashes.update(hash_values)
+
+ # Create sets of valid package hashes and uncached packages.
+ uncached_pkgs = set()
+ hash_pkg = owners_cache._hash_pkg
+ valid_pkg_hashes = set()
+ for cpv in self._vardb.cpv_all():
+ hash_value = hash_pkg(cpv)
+ valid_pkg_hashes.add(hash_value)
+ if hash_value not in cached_hashes:
+ uncached_pkgs.add(cpv)
+
+ # Cache any missing packages.
+ for cpv in uncached_pkgs:
+ owners_cache.add(cpv)
+
+ # Delete any stale cache.
+ stale_hashes = cached_hashes.difference(valid_pkg_hashes)
+ if stale_hashes:
+ for base_name_hash, bucket in base_names.items():
+ for hash_value in stale_hashes.intersection(bucket):
+ del bucket[hash_value]
+ if not bucket:
+ del base_names[base_name_hash]
+
+ return owners_cache
+
+ def get_owners(self, path_iter):
+ """
+ @return the owners as a dblink -> set(files) mapping.
+ """
+ owners = {}
+ for owner, f in self.iter_owners(path_iter):
+ owned_files = owners.get(owner)
+ if owned_files is None:
+ owned_files = set()
+ owners[owner] = owned_files
+ owned_files.add(f)
+ return owners
+
+ def iter_owners(self, path_iter):
+ """
+ Iterate over tuples of (dblink, path). In order to avoid
+ consuming too many resources for too much time, resources
+ are only allocated for the duration of a given iter_owners()
+ call. Therefore, to maximize reuse of resources when searching
+ for multiple files, it's best to search for them all in a single
+ call.
+ """
+
+ owners_cache = self._populate()
+
+ vardb = self._vardb
+ root = vardb.root
+ hash_pkg = owners_cache._hash_pkg
+ hash_str = owners_cache._hash_str
+ base_names = self._vardb._aux_cache["owners"]["base_names"]
+
+ dblink_cache = {}
+
+ def dblink(cpv):
+ x = dblink_cache.get(cpv)
+ if x is None:
+ x = self._vardb._dblink(cpv)
+ dblink_cache[cpv] = x
+ return x
+
+ for path in path_iter:
+ name = os.path.basename(path.rstrip(os.path.sep))
+ if not name:
+ continue
+
+ name_hash = hash_str(name)
+ pkgs = base_names.get(name_hash)
+ if pkgs is not None:
+ for hash_value in pkgs:
+ cpv, counter, mtime = hash_value
+ if hash_pkg(cpv) != hash_value:
+ continue
+ if dblink(cpv).isowner(path, root):
+ yield dblink(cpv), path
+
class vartree(object):
"this tree will scan a var/db/pkg database located at root (passed to init)"
def __init__(self, root="/", virtual=None, clone=None, categories=None,
@@ -2202,36 +2401,30 @@ class dblink(object):
eerror(msg)
- if collision_protect:
+ msg = []
+ msg.append("")
+ msg.append("Searching all installed" + \
+ " packages for file collisions...")
+ msg.append("")
+ msg.append("Press Ctrl-C to Stop")
+ msg.append("")
+ eerror(msg)
+
+ owners = self.vartree.dbapi._owners.get_owners(files)
+ self.vartree.dbapi.flush_cache()
+
+ for pkg, owned_files in owners.iteritems():
+ cpv = pkg.mycpv
msg = []
- msg.append("")
- msg.append("Searching all installed" + \
- " packages for file collisions...")
- msg.append("")
- msg.append("Press Ctrl-C to Stop")
- msg.append("")
+ msg.append("%s" % cpv)
+ for f in sorted(owned_files):
+ msg.append("\t%s" % os.path.join(destroot,
+ f.lstrip(os.path.sep)))
eerror(msg)
-
- found_owner = False
- for cpv in self.vartree.dbapi.cpv_all():
- cat, pkg = catsplit(cpv)
- mylink = dblink(cat, pkg, destroot, self.settings,
- vartree=self.vartree)
- mycollisions = []
- for f in collisions:
- if mylink.isowner(f, destroot):
- mycollisions.append(f)
- if mycollisions:
- found_owner = True
- msg = []
- msg.append("%s" % cpv)
- for f in mycollisions:
- msg.append("\t%s" % os.path.join(destroot,
- f.lstrip(os.path.sep)))
- eerror(msg)
- if not found_owner:
- eerror(["None of the installed" + \
- " packages claim the file(s)."])
+ if not owners:
+ eerror(["None of the installed" + \
+ " packages claim the file(s)."])
+ if collision_protect:
return 1
writemsg_stdout(">>> Merging %s to %s\n" % (self.mycpv, destroot))