-rw-r--r--  Changelog                      7
-rwxr-xr-x  bin/gorg                       7
-rw-r--r--  data/gorg/schema.sql          19
-rw-r--r--  etc/gorg/gorg.conf.sample     11
-rwxr-xr-x  lib/gorg/cgi-bin/search.cgi   50
-rw-r--r--  lib/gorg/search.rb           444
6 files changed, 8 insertions, 530 deletions
diff --git a/Changelog b/Changelog
index cd3570f..2cf71e1 100644
--- a/Changelog
+++ b/Changelog
@@ -123,3 +123,10 @@
2012-10-26 gorg-0.6.5 Maintenance release
. Compatibility fixes for Ruby 1.9
+
+======================================================
+
+2012-..-.. gorg-...
+ . Remove search support. It is not used on gentoo.org, and
+ it depends on the obsolete dbi code which does not work
+ with Ruby 1.9.
diff --git a/bin/gorg b/bin/gorg
index 4171338..0c7a952 100755
--- a/bin/gorg
+++ b/bin/gorg
@@ -49,7 +49,6 @@ Available options:
NB: relative paths in xml are from current directory
absolute paths are from {root} in config file
-v, --version : display gorg version number
--I, --index : scan and index xml files
--param N V : parameter name and value to be passed to the XSL processor
It can appear more than once
e.g. gorg<file.xml --param N1 V1 --param N2 V2
@@ -67,12 +66,6 @@ if ARGV.length == 1 and ['-W', '--web'].include?(ARGV[0]) then
elsif ARGV.length == 1 and ['-C', '--clean-cache'].include?(ARGV[0]) then
# Cache clean up requested, do not bother about STDIN
Cache.washCache($Config["cacheDir"], tmout=900, cleanTree=true)
-elsif ARGV.length == 1 and ['-I', '--index'].include?(ARGV[0]) then
- require 'gorg/search'
- # Index xml files, do not bother about STDIN
- gs = GDig::GSearch.new
- gs.cleanup # Remove old files
- gs.indexDir # Scan for new/modified files
elsif ARGV.include?('-F') or ARGV.include?('--filter') or not STDIN.tty?
# Be a filter by default when data is piped to gorg
# or when -F, --filter is used
diff --git a/data/gorg/schema.sql b/data/gorg/schema.sql
deleted file mode 100644
index 3398c1f..0000000
--- a/data/gorg/schema.sql
+++ /dev/null
@@ -1,19 +0,0 @@
-drop table if exists files;
-create table files(
- id int auto_increment primary key,
- path varchar(255) unique,
- lang varchar(5),
- timestamp varchar(32),
- size bigint,
- txt mediumtext) CHARACTER SET utf8;
-create unique index files_path on files (path(255));
-create index files_lang on files (lang);
-create fulltext index files_txt on files (txt);
-
-drop table if exists savedsearches;
-create table savedsearches(
- words tinytext,
- bool char(1),
- lang varchar(5),
- result mediumblob);
-create index savedsearches_words on savedsearches(lang, words(200));
diff --git a/etc/gorg/gorg.conf.sample b/etc/gorg/gorg.conf.sample
index c3fda72..30f0ab8 100644
--- a/etc/gorg/gorg.conf.sample
+++ b/etc/gorg/gorg.conf.sample
@@ -2,7 +2,7 @@
# Root dir, typically, your DocumentRoot
# (f)cgi scripts find it in their environment but
-# the stand-alone webserver and the search engine need it
+# the stand-alone webserver needs it
root = "/home/neysx/gentoo.org/gentoo/xml/htdocs"
# Make webrick listen on given IP (IP only, no host name)
@@ -109,15 +109,6 @@ accessLog = "syslog"
# Listen on port (must be >1023 to be run by non-root)
port = 8008
-#
-# Search engine parameters
-#
-
-# Connect string, only mysql is supported at the moment
-dbConnect = DBI:mysql:DB_NAME:HOST_NAME
-dbUser = USERNAME
-dbPassword = PASSWORD
-
# Document language can be guessed from the document itself with
# an XPath expression. It should return the language code.
# Only the first 5 characters will be used.
diff --git a/lib/gorg/cgi-bin/search.cgi b/lib/gorg/cgi-bin/search.cgi
deleted file mode 100755
index 396001e..0000000
--- a/lib/gorg/cgi-bin/search.cgi
+++ /dev/null
@@ -1,50 +0,0 @@
-#! /usr/bin/ruby
-
-### Copyright 2004, Xavier Neys (neysx@gentoo.org)
-# #
-# # This file is part of gorg.
-# #
-# # gorg is free software; you can redistribute it and/or modify
-# # it under the terms of the GNU General Public License as published by
-# # the Free Software Foundation; either version 2 of the License, or
-# # (at your option) any later version.
-# #
-# # gorg is distributed in the hope that it will be useful,
-# # but WITHOUT ANY WARRANTY; without even the implied warranty of
-# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# # GNU General Public License for more details.
-# #
-# # You should have received a copy of the GNU General Public License
-# # along with gorg; if not, write to the Free Software
-### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-
-require 'cgi'
-require 'gorg/search'
-
-# Make CGI's env public to get access to REQUEST_URI and DOCUMENT_ROOT
-class CGI
- public :env_table
-end
-
-include Gorg
-
-# Config file is named in env var. GORG_CONF, or possibly REDIRECT_GORG_CONF
-# ENV["PATH"] is used as a dirty hackish workaround a limitation of
-# webrick's cgi handler: environment variables can't be passed to cgi's
-# (REDIRECT_)GORG_CONF should be defined when running cgi's under apache
-ENV["GORG_CONF"] = ENV["GORG_CONF"]||ENV["REDIRECT_GORG_CONF"]||ENV["PATH"]
-
-gorgInit
-cgi = CGI.new
-
-# Params
-#
-# l = language code, no param will default to en, empty param defaults to any)
-# q = query string
-# p = page number in search result (0 < p < 1e6)
-# s = page size (9 < p < 120)
-# b = boolean search (y|Y|1 means yes, anything else no)
-
-gs = GDig::GSearch.new
-gs.do_CGI(cgi)
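
For reference, the removed CGI took its arguments as ordinary query-string parameters, as documented in the comments in the deleted script above. The following is a minimal sketch of how such a request could have been assembled; the script path and the sample values are illustrative assumptions, not taken from any deployed site.

    # Minimal sketch: building a request for the removed search.cgi.
    # Parameter names (q, l, p, s, b) come from the deleted comments above;
    # the script path and the sample values are hypothetical.
    require 'cgi'

    params = {
      'q' => 'kernel configuration', # search words
      'l' => 'en',                   # language code (empty = any language)
      'p' => 1,                      # page number in the search result
      's' => 20,                     # page size (10..120)
      'b' => 'y'                     # request a boolean search
    }

    query = params.map { |k, v| "#{k}=#{CGI.escape(v.to_s)}" }.join('&')
    puts "/cgi-bin/search.cgi?#{query}"
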
diff --git a/lib/gorg/search.rb b/lib/gorg/search.rb
deleted file mode 100644
index c90448a..0000000
--- a/lib/gorg/search.rb
+++ /dev/null
@@ -1,444 +0,0 @@
-### Copyright 2004, Xavier Neys (neysx@gentoo.org)
-# #
-# # This file is part of gorg.
-# #
-# # gorg is free software; you can redistribute it and/or modify
-# # it under the terms of the GNU General Public License as published by
-# # the Free Software Foundation; either version 2 of the License, or
-# # (at your option) any later version.
-# #
-# # gorg is distributed in the hope that it will be useful,
-# # but WITHOUT ANY WARRANTY; without even the implied warranty of
-# # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# # GNU General Public License for more details.
-# #
-# # You should have received a copy of the GNU General Public License
-# # along with Foobar; if not, write to the Free Software
-### Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-
-require 'dbi'
-require 'yaml'
-require 'gorg/base'
-require 'cgi'
-
-module GDig
- class GFile
-
- def initialize(root, f, xlang)
- @root = root
- @fname = f
- @xpath2lang = xlang
- end
-
- def txt
- unless @txt then
- @txt, @lang = txtifyFile
- end
- @txt
- end
-
- def lang
- unless @lang then
- @txt, @lang = txtifyFile
- end
- @lang
- end
-
- private
-
- def txtifyFile
- x=Gorg::XSL.new
- x.xsl = <<EOXSL
-<?xml version="1.0" encoding="UTF-8"?>
- <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
- <xsl:output encoding="UTF-8" method="text" indent="no"/>
- <xsl:template match="/">
-EOXSL
- if (@xpath2lang||"").length > 1 then
- x.xsl << <<EOXSL
- <xsl:if test="#{@xpath2lang}">
- <xsl:value-of select="concat('%%LANG%%', #{@xpath2lang}, '%%&#x0A;')"/>
- </xsl:if>
-EOXSL
- end
- x.xsl << <<EOXSL
- <xsl:apply-templates/>
- </xsl:template>
- <xsl:template match="*">
- <xsl:apply-templates select="@*"/>
- <xsl:apply-templates/>
- </xsl:template>
- <xsl:template match="@*">
- <xsl:value-of select="concat(' ',.,' ')"/>
- </xsl:template>
- </xsl:stylesheet>
-EOXSL
- x.xroot = @root
- x.xml = @fname
- x.process
-
- if x.xerr and x.xerr["xmlErrLevel"] >= 3 then
- raise x.xerr["xmlErrMsg"]
- end
-
- t = x.xres
- if t =~ /^%%LANG%%([^%]+)%%/ then
- l = $1
- t = $'.strip
- else
- l = nil
- end
- t << @fname
- [t.squeeze("\n"), l]
- end
- end
-
- class DBFile
- attr_reader :fid, :webname
- def initialize(dbh, webname, localname)
- @dbh = dbh
- @webname = webname
- @localname = localname
- @row = @dbh.select_one("SELECT id,path,lang,timestamp,size FROM files where path = ?", webname)
- if @row then
- @fid = @row['id']
- else
- @fid = nil
- end
- end
-
- def DBFile.remove(dbh, fid)
- if fid then
- dbh.do("delete from files where id=#{fid}")
- end
- end
-
- def uptodate?
- if @fid then
- unless @row then
- @row = @dbh.select_one("SELECT id,path,lang,timestamp,size FROM files where id=#{@fid}")
- end
- if (fstat=File.stat(@localname)) and @row then
- @row['timestamp']==fstat.mtime.to_s and @row['size']==fstat.size
- else
- false
- end
- end
- end
-
- def update(blob, lang)
- fstat=File.stat(@localname)
- if @fid then
- # update
- sql = "update files set lang = ?, txt = ?, timestamp = ?, size = ? where id=#{@fid}"
- @dbh.do(sql, lang, blob, fstat.mtime.to_s, fstat.size)
- else
- # insert new one
- sql = "insert into files (path, lang, txt, timestamp, size) values (?, ?, ?, ?, ?)"
- @dbh.do(sql, webname, lang, blob, fstat.mtime.to_s, fstat.size)
- if id=@dbh.select_one("select last_insert_id()") then
- @fid = id[0]
- else
- @fid = nil
- end
- end
- end
- end
-
- class GSearch
- attr_reader :dbh, :searchTxt, :searchResult
- include Gorg
-
- def initialize
- @dbh = DBI.connect($Config['dbConnect'], $Config['dbUser'], $Config['dbPassword'])
- @dbh['AutoCommit'] = true
- end
-
- def indexDir
- wipe = false
- scanDir { |webName, localName|
- begin
- dbf = GDig::DBFile.new(@dbh, webName, localName)
- unless dbf.uptodate? then
- gf = GFile.new($Config['root'], webName, $Config['xlang'])
- blob = gf.txt
- lang = gf.lang
- if (lang||"").length < 1 then
- # No lang attribute, see if we can use the filename
- if $Config['flang'] and $Config['flang'].match(webName) then
- lang = $Config['flang'].match(webName)[1]
- end
- end
- dbf.update(blob, lang)
- wipe = true
- debug "#{Time.new.to_i} #{webName} indexed"
- end
- rescue Exception => e
- error "Failed to index #{webName} : #{e.to_s}"
- end
- }
- wipeSearches if wipe
- end
-
- def cleanup
- # Remove files from db either because
- # they should now be excluded or because they do not exist anymore
- wipe = false
- @dbh.select_all('select id, path from files') { |row|
- if not fileMatch(row[1]) or not File.file?($Config['root']+row[1]) then
- DBFile.remove(@dbh, row[0])
- debug "GDig::GSearch: #{row[1]} removed"
- wipe = true
- end
- }
- wipeSearches if wipe
- end
-
- def do_CGI(cgi)
- $Config["root"] = cgi.env_table['DOCUMENT_ROOT']||$Config["root"]
- query = {}
- # Get cookies
- if $Config["acceptCookies"] then
- # Add cookies to our params
- query = cookies_to_params(cgi.cookies)
- end
- # Add URI params that are not used by search engine (p,q,l,s)
- cgi.params.each{ |p, v| query[p] = v.to_s}
-
- # Choose language
- if cgi.has_key?("l") then
- lang = cgi["l"]
- elsif query.has_key?("SL") then
- lang = query["SL"]
- else
- lang = nil
- end
-
- # Perform search
- search(cgi["q"], lang)
-
- if cgi.has_key?("p") and cgi["p"] =~ /^[0-9]{1,5}$/ then
- p = cgi["p"].to_i
- else
- p = 1
- end
-
- if cgi.has_key?("s") and cgi["s"] =~ /^[0-9]{2,3}$/ then
- s = cgi["s"].to_i
- elsif query.has_key?("PL") and query["PL"] =~ /^[0-9]{2,3}$/ then
- s = query["PL"].to_i
- else
- s = 20
- end
- s = 120 if s > 120
-
- xml = xmlResult(p,s)
- header = {}; body = ""
- if cgi.has_key?("passthru") and $Config["passthru"] then
- header = {'type' => 'text/plain'}
- body = xml
- else
- if $Config["linkParam"] then
- query[$Config["linkParam"]] = cgi.script_name
- end
- if $Config["httphost"] then
- # Add HTTP_HOST to stylesheet params
- query["httphost"] = if $Config["httphost"][0] == '*' then
- cgi.host||""
- elsif $Config["httphost"].include?('*') then
- $Config["httphost"][0]
- elsif $Config["httphost"].include?(cgi.host) then
- $Config["httphost"][0]
- else
- cgi.host
- end
- end
-
- err, body, filelist, extra = xproc(xml, query, false)
- if err["xmlErrLevel"] > 0 then
- raise "#{err.collect{|e|e.join(':')}.join('<br/>')}"
- end
- cookies = makeCookies(extra)
- ct = setContentType(body)
- # Turn application/xhtml+xml into text/html if browser does not accept it
- if cgi.accept !~ /application\/xhtml\+xml/ and ct =~ /application\/xhtml\+xml(.*)$/ then
- header = {'type' => "text/html#{$1}"}
- else
- header = {'type' => ct}
- end
-
- # Add cookies to http header
- if cookies then
- header['cookie'] = cookies
- end
- end
- # If client accepts gzip encoding and we support it, return gzipped file
- if $Config["zipLevel"] > 0 and ( cgi.accept_encoding =~ /gzip(\s*;\s*q=([0-9\.]+))?/ and ($2||"1") != "0" ) then
- body = gzip(body, $Config["zipLevel"])
- header['Content-Encoding'] = "gzip"
- header['Vary'] = "Accept-Encoding"
- end
- cgi.out(header){body}
- rescue => ex
- syserr = Gorg::Status::SysError.new
- cgi.out('Status'=>syserr.errSts){syserr.html(ex)}
- error("GSearch::do_CGI() failed: #{$!}")
- end
-
- def search(str, lang)
- @searchTxt = str
- @searchResult = nil
- if (lang||"") == "" then
- @searchLang = '%'
- else
- @searchLang = lang
- end
- if str =~ /(^|\s)(([+<)(>~-][^+<)(>~-]+)|([^+<)(>~-]+\*))(\s|$)/ then
- @searchBool = "Y"
- boolClause = "in boolean mode"
- else
- @searchBool = "N"
- boolClause = ""
- end
- if @searchTxt.length > 0 then
- @searchResult = loadSearch
- unless @searchResult then
- @searchResult = []
- # Perform full text search
- sql = <<EOSQL
-select id, path, lang, match (txt) against ( ? ) as score
-from files
-where lang like ? and match (txt) against ( ? #{boolClause} )
-order by score desc
-EOSQL
- @dbh.select_all(sql, @searchTxt, @searchLang, @searchTxt).each { |r| @searchResult << [r[0],r[1],r[2],r[3]] }
- saveSearch
- end
- end
- @searchResult
- end
-
- def xmlResult(page=1, pageLength=25)
- # <search page="p" pages="n">
- # <for>search string</for>
- # <found link="/path/to/file.xml" lang="fr">
- # blah blah <b>word2</b> bleh
- # </found>
- pageLength = 20 if pageLength < 1
- xml = "<?xml version='1.0' encoding='UTF-8'?>\n\n"
-
- if @searchResult and @searchResult.length >= 1 then
- removeDeadFiles
- nPages = @searchResult.length / pageLength #/
- nPages += 1 unless 0 == @searchResult.length.modulo(pageLength)
- page = nPages if page > nPages
- page = 1 if page < 1
-
- xml << "<search page='#{page}' pages='#{nPages}' pageLength='#{pageLength}' lang='#{xmlEscape(@searchLang)}' bool='#{@searchBool}'>\n"
- xml << xmlSearchFor
- @searchResult[(page-1)*pageLength..page*pageLength-1].each { |r|
- xml << " <found link='#{r[1]}' lang='#{r[2]}' score='#{r[3]}'>\n"
- xml << xmlBlobSample(r[0]) << "\n"
- xml << " </found>\n"
- }
- else
- xml << "<search page='0' pages='0'>\n"
- xml << xmlSearchFor
- end
- xml << "</search>\n"
- end
-
- def scanDir
- Dir.chdir($Config['root']) {
- `find -L . -type f`.split("\n").each{ |localFile|
- if File.file?(localFile) then
- webFile = localFile[1..-1]
- if fileMatch(webFile) then
- yield [webFile, File.expand_path(localFile)]
- end
- end
- }
- }
- end
-
- private
-
- def xmlBlobSample(fileID)
- blob = ""
- r = @dbh.select_one("select txt from files where id = #{fileID}")
- if r then
- blob = r[0]
- # Find first matching word and extract some text around it
- stxt = @searchTxt.tr('`.,\'"\-_+~<>/?;:[]{}+|\\)(*&^%\$\#@!', ' ').split(' ')
- regs = stxt.collect { |w| Regexp.new(w, true, 'U') }
- ix = nil
- regs.each { |r| break if ix=blob.index(r) }
- if ix then
- if ix < 80 then
- x = 0
- else
- x = blob[0,ix-60].rindex(/[ ,\.]/)
- x = 0 unless x
- end
- y = blob.index(/[,\. ]/, ix+80)
- y = -1 unless y
- blob = xmlEscape(blob[x..y])
- # Mark up sought words
- regs.each { |r| blob.gsub!(r){|t| "<b>#{t}</b>"} }
- else
- x = blob[120..-1].index(/[ ,\.]/)
- blob = xmlEscape(blob[0..x])
- end
- end
- blob
- end
-
- def xmlEscape(str)
- if str
- str.gsub('&','&amp;').gsub('>','&gt;').gsub('<','&lt;')
- else
- "w00t"
- end
- end
-
- def loadSearch
- if @searchTxt then
- r = @dbh.select_one("select result from savedsearches where words = ? and lang = ? and bool = ?", @searchTxt, @searchLang, @searchBool)
- if r then
- YAML::load(r[0])
- end
- end
- end
-
- def saveSearch
- if @searchTxt then
- @dbh.do("delete from savedsearches where words = ? and lang = ? and bool = ?", @searchTxt, @searchLang, @searchBool)
- @dbh.do("insert into savedsearches (words, lang, bool, result) values(?, ?, ?, ?)", @searchTxt, @searchLang, @searchBool, @searchResult.to_yaml)
- end
- end
-
- def wipeSearches
- @dbh.do("delete from savedsearches")
- end
-
- def fileMatch(f)
- $Config['in/out'].each { |inout|
- return inout[0] if inout[1].match(f)
- }
- false
- end
-
- def removeDeadFiles
- if @searchResult then
- @searchResult.reject!{ |r| not File.file?($Config['root']+r[1]) }
- end
- end
-
- def xmlSearchFor
- " <for>#{xmlEscape(@searchTxt)}</for>\n" if @searchTxt
- end
-
- end
-
-end
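
One detail of the removed GSearch#search worth noting: whether the MySQL fulltext query ran "in boolean mode" was decided by a regular expression, i.e. by the presence of operators such as +word, -word or a trailing *. The snippet below is an illustrative check only, reusing the same pattern from the deleted code; the sample queries are made up.

    # Mirrors the boolean-mode test from the removed GSearch#search;
    # the sample queries below are invented examples.
    BOOL_RE = /(^|\s)(([+<)(>~-][^+<)(>~-]+)|([^+<)(>~-]+\*))(\s|$)/

    ['gentoo handbook', '+kernel -bsd', 'install*'].each do |q|
      mode = q =~ BOOL_RE ? 'in boolean mode' : 'natural language'
      puts "#{q.inspect} => #{mode}"
    end
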