diff options
author | Hans de Graaff <hans@degraaff.org> | 2012-10-26 14:33:02 +0200 |
---|---|---|
committer | Hans de Graaff <hans@degraaff.org> | 2012-10-26 14:35:34 +0200 |
commit | 9d53caf784e1b5ed83b68c08309c156e7c49b3d5 (patch) | |
tree | 9d4a1ef935fba86be6090e862a8e9ff59c28f994 | |
parent | Add RSpec files so that we can start creating specifications. (diff) | |
download | gorg-9d53caf784e1b5ed83b68c08309c156e7c49b3d5.tar.gz gorg-9d53caf784e1b5ed83b68c08309c156e7c49b3d5.tar.bz2 gorg-9d53caf784e1b5ed83b68c08309c156e7c49b3d5.zip |
Remove search functionality based on obsolete dbi.
-rw-r--r-- | Changelog | 7 | ||||
-rwxr-xr-x | bin/gorg | 7 | ||||
-rw-r--r-- | data/gorg/schema.sql | 19 | ||||
-rw-r--r-- | etc/gorg/gorg.conf.sample | 11 | ||||
-rwxr-xr-x | lib/gorg/cgi-bin/search.cgi | 50 | ||||
-rw-r--r-- | lib/gorg/search.rb | 444 |
6 files changed, 8 insertions, 530 deletions
@@ -123,3 +123,10 @@ 2012-10-26 gorg-0.6.5 Maintenance release . Compatibility fixes for Ruby 1.9 + +====================================================== + +2012-..-.. gorg-... + . Remove search support. It is not used on gentoo.org, and + it depends on the obsolete dbi code which does not work + with Ruby 1.9. @@ -49,7 +49,6 @@ Available options: NB: relative paths in xml are from current directory absolute paths are from {root} in config file -v, --version : display gorg version number --I, --index : scan and index xml files --param N V : parameter name and value to be passed to the XSL processor It can appear more than once e.g. gorg<file.xml --param N1 V1 --param N2 V2 @@ -67,12 +66,6 @@ if ARGV.length == 1 and ['-W', '--web'].include?(ARGV[0]) then elsif ARGV.length == 1 and ['-C', '--clean-cache'].include?(ARGV[0]) then # Cache clean up requested, do not bother about STDIN Cache.washCache($Config["cacheDir"], tmout=900, cleanTree=true) -elsif ARGV.length == 1 and ['-I', '--index'].include?(ARGV[0]) then - require 'gorg/search' - # Index xml files, do not bother about STDIN - gs = GDig::GSearch.new - gs.cleanup # Remove old files - gs.indexDir # Scan for new/modified files elsif ARGV.include?('-F') or ARGV.include?('--filter') or not STDIN.tty? 
# Be a filter by default when data is piped to gorg # or when -F, --filter is used diff --git a/data/gorg/schema.sql b/data/gorg/schema.sql deleted file mode 100644 index 3398c1f..0000000 --- a/data/gorg/schema.sql +++ /dev/null @@ -1,19 +0,0 @@ -drop table if exists files; -create table files( - id int auto_increment primary key, - path varchar(255) unique, - lang varchar(5), - timestamp varchar(32), - size bigint, - txt mediumtext) CHARACTER SET utf8; -create unique index files_path on files (path(255)); -create index files_lang on files (lang); -create fulltext index files_txt on files (txt); - -drop table if exists savedsearches; -create table savedsearches( - words tinytext, - bool char(1), - lang varchar(5), - result mediumblob); -create index savedsearches_words on savedsearches(lang, words(200)); diff --git a/etc/gorg/gorg.conf.sample b/etc/gorg/gorg.conf.sample index c3fda72..30f0ab8 100644 --- a/etc/gorg/gorg.conf.sample +++ b/etc/gorg/gorg.conf.sample @@ -2,7 +2,7 @@ # Root dir, typically, your DocumentRoot # (f)cgi scripts find it in their environment but -# the stand-alone webserver and the search engine need it +# the stand-alone webserver needs it root = "/home/neysx/gentoo.org/gentoo/xml/htdocs" # Make webrick listen on given IP (IP onlyu, no host name) @@ -109,15 +109,6 @@ accessLog = "syslog" # Listen on port (must be >1023 to be run by non-root) port = 8008 -# -# Search engine parameters -# - -# Connect string, only mysql is supported at the moment -dbConnect = DBI:mysql:DB_NAME:HOST_NAME -dbUser = USENAME -dbPassword = PASSWORD - # Document language can be guessed from the document itself with # an XPath expression. It should return the language code. # Only the first 5 characters will be used. diff --git a/lib/gorg/cgi-bin/search.cgi b/lib/gorg/cgi-bin/search.cgi deleted file mode 100755 index 396001e..0000000 --- a/lib/gorg/cgi-bin/search.cgi +++ /dev/null @@ -1,50 +0,0 @@ -#! 
/usr/bin/ruby - -### Copyright 2004, Xavier Neys (neysx@gentoo.org) -# # -# # This file is part of gorg. -# # -# # gorg is free software; you can redistribute it and/or modify -# # it under the terms of the GNU General Public License as published by -# # the Free Software Foundation; either version 2 of the License, or -# # (at your option) any later version. -# # -# # gorg is distributed in the hope that it will be useful, -# # but WITHOUT ANY WARRANTY; without even the implied warranty of -# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# # GNU General Public License for more details. -# # -# # You should have received a copy of the GNU General Public License -# # along with gorg; if not, write to the Free Software -### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - - -require 'cgi' -require 'gorg/search' - -# Make CGI's env public to get access to REQUEST_URI and DOCUMENT_ROOT -class CGI - public :env_table -end - -include Gorg - -# Config file is named in env var. GORG_CONF, or possibly REDIRECT_GORG_CONF -# ENV["PATH"] is used as a dirty hackish workaround a limitation of -# webrick's cgi handler: environment variables can't be passed to cgi's -# (REDIRECT_)GORG_CONF should be defined when running cgi's under apache -ENV["GORG_CONF"] = ENV["GORG_CONF"]||ENV["REDIRECT_GORG_CONF"]||ENV["PATH"] - -gorgInit -cgi = CGI.new - -# Params -# -# l = language code, no param will default to en, empty param defaults to any) -# q = query string -# p = page number in search result (0 < p < 1e6) -# s = page size (9 < p < 120) -# b = boolean search (y|Y|1 means yes, anything else no) - -gs = GDig::GSearch.new -gs.do_CGI(cgi) diff --git a/lib/gorg/search.rb b/lib/gorg/search.rb deleted file mode 100644 index c90448a..0000000 --- a/lib/gorg/search.rb +++ /dev/null @@ -1,444 +0,0 @@ -### Copyright 2004, Xavier Neys (neysx@gentoo.org) -# # -# # This file is part of gorg. 
-# # -# # gorg is free software; you can redistribute it and/or modify -# # it under the terms of the GNU General Public License as published by -# # the Free Software Foundation; either version 2 of the License, or -# # (at your option) any later version. -# # -# # gorg is distributed in the hope that it will be useful, -# # but WITHOUT ANY WARRANTY; without even the implied warranty of -# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# # GNU General Public License for more details. -# # -# # You should have received a copy of the GNU General Public License -# # along with Foobar; if not, write to the Free Software -### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - - -require 'dbi' -require 'yaml' -require 'gorg/base' -require 'cgi' - -module GDig - class GFile - - def initialize(root, f, xlang) - @root = root - @fname = f - @xpath2lang = xlang - end - - def txt - unless @txt then - @txt, @lang = txtifyFile - end - @txt - end - - def lang - unless @lang then - @txt, @lang = txtifyFile - end - @lang - end - - private - - def txtifyFile - x=Gorg::XSL.new - x.xsl = <<EOXSL -<?xml version="1.0" encoding="UTF-8"?> - <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0"> - <xsl:output encoding="UTF-8" method="text" indent="no"/> - <xsl:template match="/"> -EOXSL - if (@xpath2lang||"").length > 1 then - x.xsl << <<EOXSL - <xsl:if test="#{@xpath2lang}"> - <xsl:value-of select="concat('%%LANG%%', #{@xpath2lang}, '%%&#x0A;')"/> - </xsl:if> -EOXSL - end - x.xsl << <<EOXSL - <xsl:apply-templates/> - </xsl:template> - <xsl:template match="*"> - <xsl:apply-templates select="@*"/> - <xsl:apply-templates/> - </xsl:template> - <xsl:template match="@*"> - <xsl:value-of select="concat(' ',.,' ')"/> - </xsl:template> - </xsl:stylesheet> -EOXSL - x.xroot = @root - x.xml = @fname - x.process - - if x.xerr and x.xerr["xmlErrLevel"] >= 3 then - raise x.xerr["xmlErrMsg"] - end - - t = x.xres - if t =~ /^%%LANG%%([^%]+)%%/ then - l = $1 - t = $'.strip - else - l = nil - end - t << @fname - [t.squeeze("\n"), l] - end - end - - class DBFile - attr_reader :fid, :webname - def initialize(dbh, webname, localname) - @dbh = dbh - @webname = webname - @localname = localname - @row = @dbh.select_one("SELECT id,path,lang,timestamp,size FROM files where path = ?", webname) - if @row then - @fid = @row['id'] - else - @fid = nil - end - end - - def DBFile.remove(dbh, fid) - if fid then - dbh.do("delete from files where id=#{fid}") - end - end - - def uptodate? - if @fid then - unless @row then - @row = @dbh.select_one("SELECT id,path,lang,timestamp,size FROM files where id=#{@fid}") - end - if (fstat=File.stat(@localname)) and @row then - @row['timestamp']==fstat.mtime.to_s and @row['size']==fstat.size - else - false - end - end - end - - def update(blob, lang) - fstat=File.stat(@localname) - if @fid then - # update - sql = "update files set lang = ?, txt = ?, timestamp = ?, size = ? 
where id=#{@fid}" - @dbh.do(sql, lang, blob, fstat.mtime.to_s, fstat.size) - else - # insert new one - sql = "insert into files (path, lang, txt, timestamp, size) values (?, ?, ?, ?, ?)" - @dbh.do(sql, webname, lang, blob, fstat.mtime.to_s, fstat.size) - if id=@dbh.select_one("select last_insert_id()") then - @fid = id[0] - else - @fid = nil - end - end - end - end - - class GSearch - attr_reader :dbh, :searchTxt, :searchResult - include Gorg - - def initialize - @dbh = DBI.connect($Config['dbConnect'], $Config['dbUser'], $Config['dbPassword']) - @dbh['AutoCommit'] = true - end - - def indexDir - wipe = false - scanDir { |webName, localName| - begin - dbf = GDig::DBFile.new(@dbh, webName, localName) - unless dbf.uptodate? then - gf = GFile.new($Config['root'], webName, $Config['xlang']) - blob = gf.txt - lang = gf.lang - if (lang||"").length < 1 then - # No lang attribute, see if we can use the filename - if $Config['flang'] and $Config['flang'].match(webName) then - lang = $Config['flang'].match(webName)[1] - end - end - dbf.update(blob, lang) - wipe = true - debug "#{Time.new.to_i} #{webName} indexed" - end - rescue Exception => e - error "Failed to index #{webName} : #{e.to_s}" - end - } - wipeSearches if wipe - end - - def cleanup - # Remove files from db either because - # they should now be excluded or because they do not exist anymore - wipe = false - @dbh.select_all('select id, path from files') { |row| - if not fileMatch(row[1]) or not File.file?($Config['root']+row[1]) then - DBFile.remove(@dbh, row[0]) - debug "GDig::GSearch: #{row[1]} removed" - wipe = true - end - } - wipeSearches if wipe - end - - def do_CGI(cgi) - $Config["root"] = cgi.env_table['DOCUMENT_ROOT']||$Config["root"] - query = {} - # Get cookies - if $Config["acceptCookies"] then - # Add cookies to our params - query = cookies_to_params(cgi.cookies) - end - # Add URI params that are not used by search engine (p,q,l,s) - cgi.params.each{ |p, v| query[p] = v.to_s} - - # Choose language - if 
cgi.has_key?("l") then - lang = cgi["l"] - elsif query.has_key?("SL") then - lang = query["SL"] - else - lang = nil - end - - # Perform search - search(cgi["q"], lang) - - if cgi.has_key?("p") and cgi["p"] =~ /^[0-9]{1,5}$/ then - p = cgi["p"].to_i - else - p = 1 - end - - if cgi.has_key?("s") and cgi["s"] =~ /^[0-9]{2,3}$/ then - s = cgi["s"].to_i - elsif query.has_key?("PL") and query["PL"] =~ /^[0-9]{2,3}$/ then - s = query["PL"].to_i - else - s = 20 - end - s = 120 if s > 120 - - xml = xmlResult(p,s) - header = {}; body = "" - if cgi.has_key?("passthru") and $Config["passthru"] then - header = {'type' => 'text/plain'} - body = xml - else - if $Config["linkParam"] then - query[$Config["linkParam"]] = cgi.script_name - end - if $Config["httphost"] then - # Add HTTP_HOST to stylesheet params - query["httphost"] = if $Config["httphost"][0] == '*' then - cgi.host||"" - elsif $Config["httphost"].include?('*') then - $Config["httphost"][0] - elsif $Config["httphost"].include?(cgi.host) then - $Config["httphost"][0] - else - cgi.host - end - end - - err, body, filelist, extra = xproc(xml, query, false) - if err["xmlErrLevel"] > 0 then - raise "#{err.collect{|e|e.join(':')}.join('<br/>')}" - end - cookies = makeCookies(extra) - ct = setContentType(body) - # Turn application/xhtml+xml into text/html if browser does not accept it - if cgi.accept !~ /application\/xhtml\+xml/ and ct =~ /application\/xhtml\+xml(.*)$/ then - header = {'type' => "text/html#{$1}"} - else - header = {'type' => ct} - end - - # Add cookies to http header - if cookies then - header['cookie'] = cookies - end - end - # If client accepts gzip encoding and we support it, return gzipped file - if $Config["zipLevel"] > 0 and ( cgi.accept_encoding =~ /gzip(\s*;\s*q=([0-9\.]+))?/ and ($2||"1") != "0" ) then - body = gzip(body, $Config["zipLevel"]) - header['Content-Encoding'] = "gzip" - header['Vary'] = "Accept-Encoding" - end - cgi.out(header){body} - rescue => ex - syserr = Gorg::Status::SysError.new - 
cgi.out('Status'=>syserr.errSts){syserr.html(ex)} - error("GSearch::do_CGI() failed: #{$!}") - end - - def search(str, lang) - @searchTxt = str - @searchResult = nil - if (lang||"") == "" then - @searchLang = '%' - else - @searchLang = lang - end - if str =~ /(^|\s)(([+<)(>~-][^+<)(>~-]+)|([^+<)(>~-]+\*))(\s|$)/ then - @searchBool = "Y" - boolClause = "in boolean mode" - else - @searchBool = "N" - boolClause = "" - end - if @searchTxt.length > 0 then - @searchResult = loadSearch - unless @searchResult then - @searchResult = [] - # Perform full text search - sql = <<EOSQL -select id, path, lang, match (txt) against ( ? ) as score -from files -where lang like ? and match (txt) against ( ? #{boolClause} ) -order by score desc -EOSQL - @dbh.select_all(sql, @searchTxt, @searchLang, @searchTxt).each { |r| @searchResult << [r[0],r[1],r[2],r[3]] } - saveSearch - end - end - @searchResult - end - - def xmlResult(page=1, pageLength=25) - # <search page="p" pages="n"> - # <for>search string</for> - # <found link="/path/to/file.xml" lang="fr"> - # blah blah <b>word2</b> bleh - # </found> - pageLength = 20 if pageLength < 1 - xml = "<?xml version='1.0' encoding='UTF-8'?>\n\n" - - if @searchResult and @searchResult.length >= 1 then - removeDeadFiles - nPages = @searchResult.length / pageLength #/ - nPages += 1 unless 0 == @searchResult.length.modulo(pageLength) - page = nPages if page > nPages - page = 1 if page < 1 - - xml << "<search page='#{page}' pages='#{nPages}' pageLength='#{pageLength}' lang='#{xmlEscape(@searchLang)}' bool='#{@searchBool}'>\n" - xml << xmlSearchFor - @searchResult[(page-1)*pageLength..page*pageLength-1].each { |r| - xml << " <found link='#{r[1]}' lang='#{r[2]}' score='#{r[3]}'>\n" - xml << xmlBlobSample(r[0]) << "\n" - xml << " </found>\n" - } - else - xml << "<search page='0' pages='0'>\n" - xml << xmlSearchFor - end - xml << "</search>\n" - end - - def scanDir - Dir.chdir($Config['root']) { - `find -L . 
-type f`.split("\n").each{ |localFile| - if File.file?(localFile) then - webFile = localFile[1..-1] - if fileMatch(webFile) then - yield [webFile, File.expand_path(localFile)] - end - end - } - } - end - - private - - def xmlBlobSample(fileID) - blob = "" - r = @dbh.select_one("select txt from files where id = #{fileID}") - if r then - blob = r[0] - # Find first matching word and extract some text around it - stxt = @searchTxt.tr('`.,\'"\-_+~<>/?;:[]{}+|\\)(*&^%\$\#@!', ' ').split(' ') - regs = stxt.collect { |w| Regexp.new(w, true, 'U') } - ix = nil - regs.each { |r| break if ix=blob.index(r) } - if ix then - if ix < 80 then - x = 0 - else - x = blob[0,ix-60].rindex(/[ ,\.]/) - x = 0 unless x - end - y = blob.index(/[,\. ]/, ix+80) - y = -1 unless y - blob = xmlEscape(blob[x..y]) - # Mark up sought words - regs.each { |r| blob.gsub!(r){|t| "<b>#{t}</b>"} } - else - x = blob[120..-1].index(/[ ,\.]/) - blob = xmlEscape(blob[0..x]) - end - end - blob - end - - def xmlEscape(str) - if str - str.gsub('&','&amp;').gsub('>','&gt;').gsub('<','&lt;') - else - "w00t" - end - end - - def loadSearch - if @searchTxt then - r = @dbh.select_one("select result from savedsearches where words = ? and lang = ? and bool = ?", @searchTxt, @searchLang, @searchBool) - if r then - YAML::load(r[0]) - end - end - end - - def saveSearch - if @searchTxt then - @dbh.do("delete from savedsearches where words = ? and lang = ? 
and bool = ?", @searchTxt, @searchLang, @searchBool) - @dbh.do("insert into savedsearches (words, lang, bool, result) values(?, ?, ?, ?)", @searchTxt, @searchLang, @searchBool, @searchResult.to_yaml) - end - end - - def wipeSearches - @dbh.do("delete from savedsearches") - end - - def fileMatch(f) - $Config['in/out'].each { |inout| - return inout[0] if inout[1].match(f) - } - false - end - - def removeDeadFiles - if @searchResult then - @searchResult.reject!{ |r| not File.file?($Config['root']+r[1]) } - end - end - - def xmlSearchFor - " <for>#{xmlEscape(@searchTxt)}</for>\n" if @searchTxt - end - - end - -end |