### Copyright 2004, Xavier Neys (neysx@gentoo.org)
# #
# # This file is part of gorg.
# #
# # gorg is free software; you can redistribute it and/or modify
# # it under the terms of the GNU General Public License as published by
# # the Free Software Foundation; either version 2 of the License, or
# # (at your option) any later version.
# #
# # gorg is distributed in the hope that it will be useful,
# # but WITHOUT ANY WARRANTY; without even the implied warranty of
# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# # GNU General Public License for more details.
# #
# # You should have received a copy of the GNU General Public License
# # along with gorg; if not, write to the Free Software
### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
# Cache a bit of data based on
# . a path name, e.g. as received by a webserver
# . a list of parameters, e.g. as received by a webserver
# . a list of files it depends on
# Use ParseDate when available (older Ruby versions), otherwise fall back to Time.parse
begin
require "parsedate"
$haveparsedate = true
rescue LoadError
require "time"
$haveparsedate = false
end
require "fileutils"
require "find"
require "digest"
require "digest/md5"
module Gorg
CacheStamp = "Gorg-#{Gorg::Version} Cached This Data. Do not alter this file. Thanks."
module Cache
def Cache.init(config)
@@lockfile = ".cache.cleaner.lock"
@cacheDir = nil
if FileTest.directory?(config["cacheDir"])
if FileTest.writable?(config["cacheDir"])
@cacheDir = config["cacheDir"].chomp("/")
else
warn "Cache directory not writable"
end
else
warn "Invalid cache directory"
end
@zipLevel = config["zipLevel"]
@zip = @zipLevel > 0 ? ".gz" : ""
# Time-To-Live in seconds, cached items older than that will be considered too old
@ttl = config["cacheTTL"]
@cacheTree = config["cacheTree"]
@maxFiles = config["maxFiles"] # Max number of files in a single directory
@maxSize = config["cacheSize"]*1024*1024 # Config value is in MB, convert to bytes
@washNumber = config["cacheWash"] # Clean cache dir after a store operation whenever rand(@washNumber) < 10
@lastCleanup = Time.new-8e8 # Remember last time we started a cleanup so we don't pile them up
end
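# For illustration only: a config hash of the shape Cache.init reads above.
# The directory and values below are made-up sample settings, not defaults:
#
#   Gorg::Cache.init(
#     "cacheDir"  => "/var/cache/gorg", # must exist and be writable
#     "zipLevel"  => 2,                 # gzip level, 0 disables compression
#     "cacheTTL"  => 86400,             # seconds, 0 means entries never expire
#     "cacheTree" => true,              # mirror the request path as subdirectories
#     "maxFiles"  => 5000,              # max number of files per cache directory
#     "cacheSize" => 200,               # total cache size in MB
#     "cacheWash" => 100                # clean after a store when rand(cacheWash) < 10
#   )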
def Cache.hit(objPath, objParam={}, etags=nil, ifmodsince=nil)
# objPath is typically a requested path passed from a web request but it
# can be just any string. It is not checked against any actual files on the file system
#
# objParam is expected to be a hash or any object whose iterator yields two values
#
# 2 filenames are built with the arguments and should give
# the name of a metafile and a result file
# if the result file is older than @ttl seconds, hit fails
# The metafile is then checked for dependencies
# It contains a list of filenames along with their size and mtime separated by ;;
# etags and ifmodsince are used in a webserver context
# etags is set when an ETag was part of an If-None-Match request header
# etags can be an array or a single string
# If the current ETag of the cached data file matches, no data is returned (webserver should return a 304)
#
# ifmodsince is a Time object passed from an If-Modified-Since request header
# If the cached data file has not been modified since then, no data is returned (webserver should return a 304)
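# For reference, a meta file written by Cache.store looks something like this
# (version, path, size, and date are illustrative):
#
#   Gorg-0.6 Cached This Data. Do not alter this file. Thanks.
#   /var/www/proj/en/index.xml;;12345;;Wed Jan 02 10:20:30 UTC 2008;;r
#   ;;extra meta
#   Content-Type:text/html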
return nil if @cacheDir.nil? # Not initialized, ignore request
# Reminder: filenames are full path, no need to prepend dirname
dirname, basename, filename, metaname = makeNames(objPath, objParam)
raise "Cache subdir does not exist" unless FileTest.directory?(dirname)
# Hit the cache
meta, mstat = IO.read(metaname), File.stat(metaname) if metaname && FileTest.file?(metaname) && FileTest.readable?(metaname)
raise "Empty/No meta file" if meta.nil? || meta.length < 1
fstat = File.stat(filename) if filename && FileTest.file?(filename)
raise "Empty/No data file" if fstat.nil?
# Check the timestamps of files in the metadata
meta = meta.split("\n")
raise "I did not write that meta file" unless CacheStamp == meta.shift
mline = meta.shift
while mline and mline !~ /^;;extra meta$/ do
f, s, d = mline.split(";;")
if s.to_i < 0
# File did not exist when cache entry was created
raise "Required file #{f} has (re)appeared" if FileTest.file?(f) && FileTest.readable?(f)
else
# File did exist when cache entry was created, is it still there?
raise "Required file #{f} has disappeared" unless FileTest.file?(f) && FileTest.readable?(f)
fst = File.stat(f)
raise "Size of #{f} has changed from #{fst.size} to #{s.to_i}" unless fst.size == s.to_i
if $haveparsedate
raise "Timestamp of #{f} has changed" unless Time.utc(*ParseDate.parsedate(d)) == fst.mtime.utc
else
raise "Timestamp of #{f} has changed" unless Time.parse(d) == fst.mtime.utc
end
end
mline = meta.shift
end
if mline =~ /^;;extra meta$/ then
extrameta = meta.dup
else
extrameta = []
end
# Only report "not modified" when the cached response does not set a cookie
if notModified?(fstat, etags, ifmodsince) and extrameta.join !~ /set-cookie/i
raise Gorg::Status::NotModified.new(fstat)
end
file = IO.read(filename) if filename && FileTest.file?(filename) && FileTest.readable?(filename)
raise "Empty/No data file" if file.nil? || file.length < 1
# Is the data file too old
raise "Data file too old" unless @ttl==0 or (Time.new - fstat.mtime) < @ttl
# Update atime of files, ignore failures as files might have just been removed
begin
t = Time.new
File.utime(t, fstat.mtime, filename)
File.utime(t, mstat.mtime, metaname)
rescue
nil
end
# If we get here, it means the data file can be used, return cache object (data, stat(datafile), extrameta)
# The file is left (un)compressed, it's returned as it was stored
[file, fstat, extrameta]
rescue Gorg::Status::NotModified
# Nothing changed, should return a 304
debug("Client cache is up-to-date")
raise
rescue
# cache hit fails if anything goes wrong, no exception raised
debug("Cache hit on #{objPath} failed: (#{$!})")
nil
end
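# A minimal sketch of a cache lookup, assuming the path and query parameters
# come straight from a web request (argument values are illustrative):
#
#   data, fstat, extrameta = Gorg::Cache.hit("/proj/en/index.xml",
#                                            {"printable" => "yes"},
#                                            request_etags, if_modified_since)
#   # => nil on a miss, [body, File::Stat of the data file, extra meta lines] on a hit;
#   #    raises Gorg::Status::NotModified when the client copy is still valid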
def Cache.store(data, objPath, objParam={}, deps=[], extrameta=[])
# Store data in cache so it can be retrieved based on the objPath and objParams
# deps should contain a list of files that the object depends on
# as returned by our xsl processor, i.e. an array of [access_type, path] where
# access_type can be "r", "w", or "o" for, respectively, read, write, other.
# Define content-type
ct = setContentType(data)
extrameta << "Content-Type:#{ct}"
return nil if @cacheDir.nil? # Not initialized, ignore request
# Cache only if no remote objects (ftp:// or http://) in list of used files
if deps && deps.detect{|f| f[0] =~ /^o$/i }
debug "#{objPath} not cached because it needs remote resources"
return nil
end
dirname, basename, filename, metaname = makeNames(objPath, objParam)
FileUtils.mkdir_p(dirname) unless FileTest.directory?(dirname)
# Write Meta file to a temp file (with .timestamp.randomNumber appended)
metaname_t = "#{metaname}.#{Time.new.strftime('%Y%m%d%H%M%S')}.#{rand(9999)}"
# Data might need to be just a link to another .Data file
# if we find another requested path with different params but
# with identical MD5 sums
# Which is why we keep a ...xml.Data.[md5 sum] file without the parameters
# in its name that we can hard link to.
# e.g. A moron hits for 10 full handbooks with toto=1..10 in the URI,
# we'd end up with 10 identical large copies. With links we have only one.
# Old versions are expected to be cleaned up by the washCache() routine
# A Dir.glob() to find the previous ones would be too expensive
# Compute MD5 digest
md5 = Digest::MD5.hexdigest(data)
# Compress data if required
if @zipLevel > 0 then
bodyZ = data = gzip(data, @zipLevel)
else
bodyZ = nil
end
# Set mtime of data file to latest mtime of all required files
# so that caching can work better because mtimes will be
# identical on all webnodes whereas creation date of data
# would be different on all nodes.
maxmtime = Time.now-8e8
fstat = nil
begin
timeout(10){
File.open("#{metaname_t}", "w") {|fmeta|
fmeta.puts(CacheStamp)
# Write filename;;size;;mtime for each file in deps[]
deps.each {|ffe|
ftype = ffe[0]
fdep = ffe[1]
if FileTest.file?(fdep)
s = File.stat(fdep)
fmeta.puts("#{fdep};;#{s.size};;#{s.mtime.utc};;#{ftype}")
maxmtime = s.mtime if s.mtime > maxmtime and ftype =~ /^r$/i
else
# A required file does not exist, use size=-1 and old timestamp
# so that when the file comes back, the cache notices a difference
# and no cache miss gets triggered as long as file does not exist
fmeta.puts("#{fdep};;-1;;Thu Nov 11 11:11:11 UTC 1971")
end
}
fmeta.puts ";;extra meta"
extrameta.each { |m| fmeta.puts m }
}
# Get exclusive access to the cache directory while moving files and/or creating data files
File.open(dirname) { |lockd|
while not lockd.flock(File::LOCK_NB|File::LOCK_EX)
# Timeout does not occur on a blocking lock
# Try a non-blocking one repeatedly for a few seconds until timeout occurs or lock is granted
# We are in a timeout block, remember
sleep 0.1
end
# Remove previous Data
FileUtils.rm_rf(filename)
# mv temp meta file to meta file
FileUtils.mv(metaname_t, metaname)
# We keep a data file for the same requested path, with different params,
# but which ends up with same MD5 sum, i.e. identical results because of unused params
linkname = "#{basename}.#{md5}#{@zip}"
if FileTest.file?(linkname) then
# Data file already there, link to it
File.link(linkname, filename)
else
# Write data file and set its mtime to latest of all files it depends on
File.open("#{filename}", "w") {|fdata| fdata.write(data)}
# Create link
File.link(filename, linkname)
end
# mtime might need to be updated, or needs to be set
# e.g. when a dependency has changed but the result file is identical
# This is needed to keep Last-Modified dates consistent across web nodes
File.utime(Time.now, maxmtime, filename)
fstat = File.stat(filename)
}
}
ensure
FileUtils.rm_rf(metaname_t)
end
# Do we clean the cache?
washCache(dirname, 10) if @washNumber > 0 and rand(@washNumber) < 10
# Return stat(datafile) even if it's just been removed by washCache
# because another web node might still have it or will have it.
# Anyway, the cached item would be regenerated on a later request
# and a 304 would be returned if still appropriate at the time.
# Return fstat of data file (for etag...) and zipped file
[fstat, bodyZ]
rescue Timeout::Error, StandardError =>ex
if ex.class.to_s =~ /timeout::error/i then
warn("Timeout in cache store operation")
else
warn("Cache store error (#{$!})")
end
# Clean up before leaving
FileUtils.rm_rf(filename||"")
FileUtils.rm_rf(metaname||"")
nil # return nil so that caller can act if a failed store really is a problem
end
def Cache.washCache(dirname, tmout=30, cleanTree=false)
# Clean cache entries that are either older than the TTL (in seconds)
# or remove the oldest entries until the total size drops below maxSize
# cleanTree means subdirectories are scanned and cleaned as well
# Unused *.Data.[md5] files that are not referenced anymore, because the document
# has changed and generated a new *.Data.[md5] file, are removed too
# tmout is the maximum time (in seconds) spent in here
return nil if @cacheDir.nil? # Not initialized, ignore request
# Also ignore request if dirname not equal to @cacheDir or under it
return nil unless dirname[0, @cacheDir.length] == @cacheDir
# Also ignore request if dirname does not exist yet
return nil unless FileTest.directory?(dirname)
# Also return if less than a minute has elapsed since latest cleanup
t0 = Time.new
return nil if t0 - @lastCleanup < 60
# Remember for next time
@lastCleanup = t0
Dir.chdir(dirname) { |d|
# Recreate lock file if it's been lost
unless File.exist?(@@lockfile)
File.open(@@lockfile, "w") { |lockf| lockf.puts("Lock file created on #{Time.now.utc} by gorg")}
end
# Grab lockfile
File.open(@@lockfile) { |lockf|
if lockf.flock(File::LOCK_NB|File::LOCK_EX) then
infoMsg = "Cleaning up cache in #{dirname} (cleanTree=#{cleanTree}, tmout=#{tmout})"
info(infoMsg)
puts infoMsg if cleanTree
timeout(tmout) {
totalSize, deletedFiles, scannedDirectories = washDir(dirname, cleanTree)
if totalSize >= 0 then
# Size == -1 means dir was locked, throwing an exception would have been nice :)
infoMsg = if cleanTree then
"Cache in #{dirname} is now #{totalSize/1024/1024} MB, #{deletedFiles} files removed in #{(Time.now-t0).to_i} seconds in #{scannedDirectories} directories"
else
"#{deletedFiles} files removed in #{(Time.now-t0).to_i} seconds in #{dirname}"
end
info(infoMsg)
puts infoMsg if cleanTree
end
}
else
# Locked dir, another process is busy cleaning up
debug("#{dirname} locked, skipping")
puts("#{dirname} locked, skipping") if cleanTree
end # of lock test
} # end of File.open(@@lockfile), close & release lock automatically
}
rescue Timeout::Error
info("Timeout while cleaning #{dirname}")
puts("Timeout while cleaning #{dirname}") if cleanTree
rescue StandardError =>ex
error("Error while cleaning cache: #{ex}")
puts("Error while cleaning cache: #{ex}") if cleanTree
end
private
def Cache.washDir(dirname, cleanTree)
# Clean up cache starting from dirname and in subdirectories if cleanTree is true
# Return [newSize in bytes, # deleted files, # scanned directories]
size = nDeleted = nDirectories = 0
Dir.chdir(dirname) { |d|
hIno = Hash.new(0) # hash of file inodes with more than one link
lst = Array.new # array of file names, atime, ...
ttl = @ttl
ttl = 8e8 if ttl == 0 # No ttl, keep very old docs!
# Get list of files sorted on their dirname+atime
Find.find('.') { |f|
begin
unless f =~ /^\.$|#{@@lockfile}/ # ignore "." and lockfile
ff = File.stat(f)
if ff.directory? then
Find.prune unless cleanTree
elsif ff.file? and f =~ /Meta|Data/ then
hIno[ff.ino] = ff.nlink if ff.nlink > 1
# List of files has [name, atime, size, # links, inode]
lst << [f, ff.atime, ff.size, ff.nlink, ff.ino]
end
end
rescue
nil # File.stat can fail because file could have been deleted, ignore error
end
}
# Compute total size, counting hard-linked data files only once (size/nlink per entry)
size = lst.inject(0){ |tot, a| tot + if a[3] > 0 then a[2]/a[3] else 0 end }
# Delete old *.Data.[md5] files that are not being referenced anymore
lst.each { |a|
if a[3] == 1 && a[0] =~ /\.Data\.[0-9a-f]+(\.gz)?$/ then
# Data file with no more links pointing to it
FileUtils.rm_rf(a[0])
nDeleted += 1
size -= a[2]
a[3] = 0 # Mark as deleted
end
}
# Sort all files on atime
lst.sort!{ |a1, a2| a1[1] <=> a2[1] }
t0 = Time.new
# Clean until size < maxSize _AND_ atime more recent than TTL
lst.each { |a|
break if size < @maxSize and t0-a[1] < ttl
next if a[3] < 1 # Already deleted in previous step
FileUtils.rm_rf(a[0])
nDeleted += 1
# Total size -= file size IF last link to data
if a[3] == 1 || hIno[a[4]] <= 1 then
size -= a[2]
end
hIno[a[4]] -= 1 if hIno[a[4]] > 0
a[3] = 0 # Mark as deleted by setting nlinks to 0
}
# Remove deleted files from array
lst.reject! { |a| a[3] < 1 }
# Sort files per directory to enforce maxFiles
if cleanTree then
# Split the array in an array per directory
# and keep the files sorted on atime in each directory
slst = Hash.new
lst.length.times {
a = lst.shift
d = File.dirname(a[0])
if slst[d] then
slst[d] << a
else
slst[d] = [a]
end
}
else
# If not cleaning whole tree, we have only a single dir
slst = {"." => lst}
end
nDirectories = slst.length
slst.each { |d, lst|
# Remove oldest files so that we have less than @maxFiles in it
if lst.length >= @maxFiles then
# Remove enough files to leave at most 90% of @maxFiles so we don't clean up only a handful of files repeatedly
(lst.length - 9*@maxFiles/10).times {
if a = lst.shift then
FileUtils.rm_rf(a[0])
nDeleted += 1
# Total size -= file size IF last link to data
if a[3] == 1 || hIno[a[4]] <= 1 then
size -= a[2]
end
hIno[a[4]] -= 1 if hIno[a[4]] > 0
end
}
end
}
} #end of chdir
[size, nDeleted, nDirectories]
end
def Cache.makeNames(obj, params)
# Build meta filename and data filename from arguments
#
# obj is broken into a path and a filename with appended params
# e.g. /proj/en/index.xml?printable=yes becomes /proj/en and index.xml+printable+yes
# or .#proj#en#index.xml+printable+yes
# depending on cacheTree param value
# .Meta and .Data are appended respectively to the meta filename and data filename
# Base is the filename without the appended params, e.g. .#proj#en#index.xml.Data
if @cacheTree then
# Use a path and a file
dir = "#{@cacheDir}#{File.dirname(obj)}"
base = f = File.basename(obj)
else
# Convert full path into a single filename
dir = @cacheDir
base = f = ".#{obj.gsub(/\//,'#')}"
end
f = "#{f}+#{params.reject{|k,v| k.nil?}.sort.join('+')}" if params && params.to_a.length > 0
# Remove funky chars and squeeze duplicates into single chars
f = f.gsub(/[^\w\#.+_-]/, "~").squeeze("~.#+")
# Return [cache dir, base Data name without params, full Data name, Meta name]
[dir, "#{dir}/#{base}.Data", "#{dir}/#{f}.Data#{@zip}", "#{dir}/#{f}.Meta"]
end
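# For illustration, with cacheTree=false, cacheDir="/var/cache/gorg" and zipLevel>0,
# makeNames("/proj/en/index.xml", {"printable"=>"yes"}) would return something like:
#   dirname   /var/cache/gorg
#   basename  /var/cache/gorg/.#proj#en#index.xml.Data
#   filename  /var/cache/gorg/.#proj#en#index.xml+printable+yes.Data.gz
#   metaname  /var/cache/gorg/.#proj#en#index.xml+printable+yes.Meta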
end
end