#!/usr/bin/perl
# --------------------------------- #
# SOPHIAKNOWS #
# --------------------------------- #
# LIBRARY INDEXER #
# --------------------------------- #
# Created: 2000-12-02 #
# Issued: 2000-12-02 #
# Modified: 2004-10-15 #
# Copyright (c) 2004 #
# A.R. Pisarra, SophiaKnows #
# --------------------------------- #
# --------------------------------- #
# A. SUB INDEX_FILE
#
# Index_File($wordlog, $targetfile, $targetid)
#
#   $wordlog    - path of the word index file (read, then rewritten)
#   $targetfile - path of the file to index (handed to JoinFilebody)
#   $targetid   - id stored against every word found in $targetfile
#
# Every index row has the form
#
#     word","id=count,id=count,...
#
# The existing rows are loaded, the \w+ tokens of the text returned by
# JoinFilebody are counted, and one "$targetid=count" entry per distinct
# word is merged in.  Entries already stored for $targetid are discarded
# first, so indexing the same file again replaces its counts instead of
# appending duplicates.  Rows are written back sorted by word; words left
# without any entry are not written.
#
# Returns true when the index was rewritten, false (after a warning) when
# the index file could not be opened for writing.
sub Index_File {
    # 1. READ BUFFERED ARGS (stringified; a missing argument becomes '')
    my ($wordlog, $targetfile, $targetid) =
        map { defined $_ ? "$_" : '' } @_[0 .. 2];

    # 2. INITIALIZE/CLEAR HASHES
    my %logentries;       # word => "id=count,id=count,..."
    my %wordoccurence;    # word => occurrences in $targetfile

    # 3. READ EXISTING INDEX INTO HASH
    # A missing or unreadable index is treated as an empty one.
    if (open(my $in, '<', $wordlog)) {
        while (my $storedrow = <$in>) {
            chomp($storedrow);
            my ($storedword, $storedoccurences) = split(/","/, $storedrow);
            next unless defined $storedword && length $storedword;
            $storedoccurences = '' unless defined $storedoccurences;
            # Keep every entry except empty ones and those recorded for
            # $targetid by an earlier run.
            $logentries{$storedword} = join(',',
                grep { length $_ && (split(/=/, $_))[0] ne $targetid }
                split(/,/, $storedoccurences));
        }
        close($in);
    }

    # 4. READ/JOIN FILE BODY INTO VARIABLE
    my $filebody = JoinFilebody($targetfile);

    # 5. COUNT OCCURRENCES OF EACH WORD
    foreach my $word ($filebody =~ /\w+/g) {
        $wordoccurence{$word}++;
    }

    # 6. INITIALIZE/APPEND WORD HISTORIES
    # The count is looked up under the word being processed; the earlier
    # code used a variable that was never assigned, leaving counts empty.
    foreach my $newword (keys %wordoccurence) {
        my $entry = "$targetid=$wordoccurence{$newword}";
        if (defined $logentries{$newword} && length $logentries{$newword}) {
            $logentries{$newword} .= ",$entry";
        }
        else {
            $logentries{$newword} = $entry;
        }
    }

    # 7. STORE UPDATED INDEX TO LOG FILE
    my $out;
    unless (open($out, '>', $wordlog)) {
        warn "Index_File: cannot write '$wordlog': $!\n";
        return;
    }
    foreach my $allword (sort keys %logentries) {
        next unless length $logentries{$allword};
        print $out "$allword\",\"$logentries{$allword}\n";
    }
    return close($out);
}
# B. SUB JOIN FILEBODY
#
# JoinFilebody($path)
#
# Reads the file at $path, joins its lines into a single string (each
# line followed by one space), lowercases A-Z and removes HTML comments,
# <script> and <style> elements and all remaining tags.  Every removed
# span is replaced by a space so that the words on either side of a tag
# are not fused together.
#
# Returns the resulting text; an unreadable file yields the empty string.
sub JoinFilebody {
    my $path = defined $_[0] ? "$_[0]" : '';
    my $filecontents = '';

    # Three-argument open: the path is never interpreted as a mode or pipe.
    if (open(my $fh, '<', $path)) {
        while (my $line = <$fh>) {
            chomp($line);                  # strip line ends
            $filecontents .= $line . ' ';  # concatenate lines
        }
        close($fh);
    }

    $filecontents =~ tr/A-Z/a-z/;                          # lowercase all

    # `.*?` rather than `.+?`: an empty comment/script/style body must
    # match on its own instead of running on to a later closing marker.
    $filecontents =~ s/<!--.*?-->/ /g;                     # strip comments
    $filecontents =~ s/<script[^>]*>.*?<\/script>/ /g;     # strip scripts
    $filecontents =~ s/<style[^>]*>.*?<\/style>/ /g;       # strip styles
    $filecontents =~ s/<[^>]+>/ /g;                        # strip html

    return $filecontents;
}
1;
#!/usr/bin/perl
# --------------------------------- #
# LIBRARY SEARCH BASIC #
# --------------------------------- #
# Created: 2000-12-02 #
# Issued: 2000-12-02 #
# Modified: 2004-10-15 #
# Copyright (c) 2004 #
# A.R. Pisarra, SophiaKnows #
# --------------------------------- #
# --------------------------------- #
# This basic version of the library
# search:
#
# (1) Performs a quasi boolean
# search of the library index for
# files containing 1 or more instances
# of all words passed to the search
# function; and
#
# (2) Prints a relevancy ranked
# list of the files matching the
# searched criteria
# DIRECTORIES/FILES
# Paths of the word index and of the page key file.
$the_dir   = "/data/";
$wordindex = $the_dir."index.txt";
# The earlier form `"$the_dir."pages.txt"` had an unbalanced quote and
# stopped the whole script from compiling.
$pagekey   = $the_dir."pages.txt";

# INCLUDE/INVOKE PARSEFORM
# Parse_Form comes from parseform.pl; this script reads its result from
# %formdata (presumably the decoded CGI form -- confirm in parseform.pl).
require "parseform.pl";
Parse_Form();

# SEARCH WORD(S)
# The query is lowercased (A-Z only) and split on whitespace.
$searchwords = defined $formdata{'restrictions'} ? $formdata{'restrictions'} : '';
$searchwords =~ tr/A-Z/a-z/;
@searchwords = split(" ", $searchwords);

# INITIALIZE SEARCH INDEX (fills %wordhistories)
InitWordlog($wordindex);

# INITIALIZE PAGE KEY (fills %pagefiles and %pagetitles)
InitPageKey($pagekey);

# FIND MATCHED ENTRIES
# %pagematches counts, per page id, how many index entries of the search
# words named that page; %matches sums the stored hit counts.
foreach my $searchword (@searchwords) {
    next unless $wordhistories{$searchword};
    foreach my $pagehit (split(/\,/, $wordhistories{$searchword})) {
        my ($pid, $hits) = split(/=/, $pagehit);
        $pagematches{$pid}++;
        $matches{$pid} += $hits;
    }
}

# PRINT RESULTS
# Only pages whose count exceeds $#searchwords (i.e. reaches the number
# of search words) are listed, ordered by descending total hits.
# NOTE(review): file and title are printed unescaped and the href is
# unquoted -- confirm the page key never holds spaces or markup.
print "Content-type: text/html\n\n";
print "<p>PAGE HITS IN DESCENDING RANKED ORDER:</p>\n";
foreach my $key (sort HashByDescendingValues (keys(%matches))) {
    next unless $pagematches{$key} > $#searchwords;
    print "<br />$matches{$key} HITS \@ ";
    print "<a href=$pagefiles{$key}>$pagetitles{$key}</a>\n";
}
# SUBROUTINES:
# SUB: InitWordlog
#
# InitWordlog($path)
#
# Loads the word index at $path into the package hash %wordhistories.
# Each row is split on the three characters `","`; the first field is the
# word and the second its history string.  Blank rows are skipped.
#
# Returns 1 after loading, or nothing when the file cannot be opened
# (%wordhistories is then left untouched).
sub InitWordlog {
    my $path = defined $_[0] ? "$_[0]" : '';

    # Three-argument open with a lexical handle; all temporaries are
    # lexical so no unrelated package variables are overwritten.
    open(my $fh, '<', $path) or return;
    while (my $row = <$fh>) {
        chomp($row);
        my ($word, $history) = split(/\",\"/, $row);
        next unless defined $word;
        $wordhistories{$word} = $history;
    }
    close($fh);
    return 1;
}
# SUB: InitPageKey
#
# InitPageKey($path)
#
# Loads the page key file into the package hashes %pagefiles and
# %pagetitles.  Each row is tab-separated: page id, page file, page title.
#
# $path is the file to read.  Earlier code ignored the argument and always
# read the global $pagekey; the global is still used as a fallback when no
# argument is passed.
#
# Returns 1 after loading, or nothing when the file cannot be opened.
sub InitPageKey {
    my $path = defined $_[0] ? "$_[0]" : $pagekey;
    $path = '' unless defined $path;

    open(my $fh, '<', $path) or return;
    while (my $row = <$fh>) {
        chomp($row);
        my ($pageid, $pagefile, $pagetitle) = split(/\t/, $row);
        next unless defined $pageid;    # blank row
        $pagefiles{$pageid}  = $pagefile;
        $pagetitles{$pageid} = $pagetitle;
    }
    close($fh);
    return 1;
}
# SUB: HashByDescendingValues
# Sort comparator: orders the keys being sorted ($a, $b) by their value
# in %matches, largest value first.
sub HashByDescendingValues {
    my $score_a = $matches{$a};
    my $score_b = $matches{$b};
    return $score_b <=> $score_a;
}
1;
#!/usr/bin/perl
# --------------------------------- #
# SOPHIAKNOWS #
# --------------------------------- #
# LIBRARY SEARCH ADVANCED #
# --------------------------------- #
# Created: 2000-12-02 #
# Issued: 2000-12-02 #
# Modified: 2004-10-15 #
# Copyright (c) 2004 #
# A.R. Pisarra, SophiaKnows #
# --------------------------------- #
# --------------------------------- #
# This version of library search adds
# a negative word feature to the basic
# search method
# FILES
# Paths of the word index and of the page key file.
$the_dir   = "/data/";
$wordindex = $the_dir."index.txt";
# The earlier form `"$the_dir."pages.txt"` had an unbalanced quote and
# stopped the whole script from compiling.
$pagekey   = $the_dir."pages.txt";

# INCLUDE/INVOKE PARSEFORM
# Parse_Form comes from parseform.pl; this script reads its result from
# %formdata (presumably the decoded CGI form -- confirm in parseform.pl).
require "parseform.pl";
Parse_Form();

# SEARCH WORD(S)
# The query is lowercased (A-Z only) and split on whitespace.  A word
# with a leading "^" is a negative word: pages containing it are excluded.
$searchwords = defined $formdata{'restrictions'} ? $formdata{'restrictions'} : '';
$searchwords =~ tr/A-Z/a-z/;
@searchwords = split(" ", $searchwords);

# INITIALIZE SEARCH INDEX (fills %wordhistories)
InitWordlog($wordindex);

# INITIALIZE PAGE KEY (fills %pagefiles and %pagetitles)
InitPageKey($pagekey);

# FIND MATCHED ENTRIES
foreach my $searchword (@searchwords) {
    if ($searchword =~ s/^\^//) {
        # CHECK FOR STOPWORDS: the "^" has just been stripped; remember
        # every page that contains the negative word.
        $stopwords++;
        next unless $wordhistories{$searchword};
        foreach my $pagehit (split(/\,/, $wordhistories{$searchword})) {
            my ($pid, $hits) = split(/=/, $pagehit);
            $notmatches{$pid}++;
        }
    }
    else {
        # Positive word: count the page and add up its stored hits.
        next unless $wordhistories{$searchword};
        foreach my $pagehit (split(/\,/, $wordhistories{$searchword})) {
            my ($pid, $hits) = split(/=/, $pagehit);
            $pagematches{$pid}++;
            $matches{$pid} += $hits;
        }
    }
}

# ID FILES NOT INCLUDING STOPWORD HITS
# A page free of every negative word is credited with one match per
# negative word, so it can still reach the threshold used below; a page
# containing any negative word gets no credit and falls short.
if ($stopwords) {
    foreach my $key (keys(%matches)) {
        next if $notmatches{$key};
        $pagematches{$key} = $pagematches{$key} + $stopwords;
    }
}

# PRINT RESULTS
# Only pages whose count exceeds $#searchwords (i.e. reaches the number
# of search words, negative ones included) are listed, ordered by
# descending total hits.
# NOTE(review): file and title are printed unescaped and the href is
# unquoted -- confirm the page key never holds spaces or markup.
print "Content-type: text/html\n\n";
print "<p>PAGE HITS IN DESCENDING RANKED ORDER:</p>\n";
foreach my $key (sort HashByDescendingValues (keys(%matches))) {
    next unless $pagematches{$key} > $#searchwords;
    print "<br />$matches{$key} HITS \@ ";
    print "<a href=$pagefiles{$key}>$pagetitles{$key}</a>\n";
}
# SUBROUTINES:
# SUB: InitWordlog
#
# InitWordlog($path)
#
# Loads the word index at $path into the package hash %wordhistories.
# Each row is split on the three characters `","`; the first field is the
# word and the second its history string.  Blank rows are skipped.
#
# Returns 1 after loading, or nothing when the file cannot be opened
# (%wordhistories is then left untouched).
sub InitWordlog {
    my $path = defined $_[0] ? "$_[0]" : '';

    # Three-argument open with a lexical handle; all temporaries are
    # lexical so no unrelated package variables are overwritten.
    open(my $fh, '<', $path) or return;
    while (my $row = <$fh>) {
        chomp($row);
        my ($word, $history) = split(/\",\"/, $row);
        next unless defined $word;
        $wordhistories{$word} = $history;
    }
    close($fh);
    return 1;
}
# SUB: InitPageKey
#
# InitPageKey($path)
#
# Loads the page key file into the package hashes %pagefiles and
# %pagetitles.  Each row is tab-separated: page id, page file, page title.
#
# $path is the file to read.  Earlier code ignored the argument and always
# read the global $pagekey; the global is still used as a fallback when no
# argument is passed.
#
# Returns 1 after loading, or nothing when the file cannot be opened.
sub InitPageKey {
    my $path = defined $_[0] ? "$_[0]" : $pagekey;
    $path = '' unless defined $path;

    open(my $fh, '<', $path) or return;
    while (my $row = <$fh>) {
        chomp($row);
        my ($pageid, $pagefile, $pagetitle) = split(/\t/, $row);
        next unless defined $pageid;    # blank row
        $pagefiles{$pageid}  = $pagefile;
        $pagetitles{$pageid} = $pagetitle;
    }
    close($fh);
    return 1;
}
# SUB: HashByDescendingValues
# Sort comparator: orders the keys being sorted ($a, $b) by their value
# in %matches, largest value first.
sub HashByDescendingValues {
    my $score_a = $matches{$a};
    my $score_b = $matches{$b};
    return $score_b <=> $score_a;
}
1;