#!/usr/bin/perl

# searchtext
# Google-style searching through text files on your *nix computer
# http://nodivisions.com/software/searchtext/
# By Anthony DiSante, http://nodivisions.com/contact/
#
# Changelog:
#
# 20041007:
#
# Added "-type f" to the find command, put its -iname argument in
# single-quotes, and added *.shtml as one of the "alltext" extensions.
#
# 20040614:
#
# Came across a webpage using "stm" for its extension, so I added
# that to the default text file list.
#
# 20040521:
#
# First public release.

use strict;
use warnings;

use Cwd qw(getcwd);
use Fcntl qw(:flock);
use File::Basename qw(basename);

# The options argument from the command line.  search_posts() reads it to
# choose between plaintext output (it contains a "t") and HTML output.
my $opts = '';

# Run the search only when executed as a script, so that the subs below can
# also be loaded (require/do) without side effects.
main(@ARGV) unless caller;

# main(options, searchpath, extension, terms)
#
# Validates the four command-line arguments, normalizes them and runs the
# search.  Dies with the usage text when any argument is missing or empty.
sub main {
    my ($options, $path, $ext, $terms) = @_;

    my $me = basename($0);

    my @missing = grep { !defined($_) || !length($_) } ($options, $path, $ext, $terms);
    die usage_text($me) if @missing;

    $opts = $options;

    # Phrases are written in single quotes on the command line (the whole
    # terms argument is already inside double quotes); the term parser
    # works on double quotes.
    $terms =~ s/'/"/g;

    # Expand "." so that the file:// locations in the output are absolute.
    if ($path eq '.') {
        my $cwd = getcwd();
        die "$0: couldn't determine the current directory: $!\n" unless defined $cwd;
        $path = $cwd;
    }

    search_posts($path, $ext, $terms);
    return;
}

# usage_text(program_name)
#
# Returns the help message shown when the script is called with missing
# arguments.  It ends in a newline so that die() does not append a
# file/line suffix.
#
# NOTE(review): in the copy of this file under review the "Usage:" line
# showed no arguments at all; the placeholders below are reconstructed from
# the four bullet descriptions -- confirm against the upstream release.
sub usage_text {
    my ($me) = @_;

    return <<"END_USAGE";

$me: does a Google-style search of files on your Unix/Linux computer.

Usage: $me <options> <searchpath> <extension> <terms>

- options is either h or t (for html or plaintext output).

- searchpath will be searched recursively.

- extension is a comma-separated list of one or more extensions (no spaces);
  only files with these extensions will be searched for your search terms.
  or you can say "alltext" here to search most common text file extensions
  (txt,log,htm,html,shtml,php,php3,asp,cfm,conf,stm).

- terms is any number of words or phrases; put the whole thing in double-
  quotes and put phrases in single-quotes.

Examples:

$me -t /home/me/files/ txt,log "foobar"
$me -t /home/me/files/ alltext "foobar 'kick the can' ketchup"

END_USAGE
}

# search_posts(path, ext, string)
#
# Searches every file under path whose extension is selected by ext (see
# get_filenames) for the terms in string, and prints a report to the
# currently selected output handle.
#
# We do this Google-stylee: to get a match, multiple terms separated by
# spaces must ALL be found in the file, unless (some of) the terms are in
# double quotes, in which case that exact sequence of words must be found.
# Matching is case-insensitive and on whole words.
#
# The output format depends on the file-level $opts: plaintext when it
# contains a "t", HTML otherwise.  Unreadable files are reported on STDERR
# and skipped instead of aborting the whole search.
sub search_posts {
    my ($path, $ext, $string) = @_;

    my $original_string = $string;
    my $text_mode       = $opts =~ /t/ ? 1 : 0;

    # NOTE(review): in the copy under review both branches of the highlight
    # selection were empty strings and the HTML-mode output contained no
    # tags, which looks like markup lost in transit.  The tags used in this
    # sub are a minimal reconstruction -- confirm against upstream.
    my ($highlight_start, $highlight_end) = $text_mode ? ('', '') : ('<b>', '</b>');
    my ($para_start, $para_end)           = $text_mode ? ('', '') : ('<p>', '</p>');

    my @search_terms = parse_search_terms($string);
    my @patterns     = map { term_pattern($_) } @search_terms;
    my @files        = get_filenames($path, $ext);

    my @results;
    my $num_found = 0;

    FILE:
    for my $file (@files) {
        next FILE unless -e $file;

        my $whole_post = read_file($file);
        next FILE unless defined $whole_post;

        compress_whitespace($whole_post);
        strip_html($whole_post) if $text_mode;

        # A file matches only when it contains every term.
        next FILE unless matches_all_patterns($whole_post, @patterns);

        $num_found++;

        # Build the summary: for each term, up to 200 characters of context
        # on either side, trimmed to word boundaries where possible.
        my $summary = '';
        for my $i (0 .. $#search_terms) {
            my $pattern = $patterns[$i];
            my ($pre, $hit, $post) =
                $whole_post =~ m{((?:\b.{0,200})?)($pattern)((?:.{0,200}\b)?)}s;

            # Cannot normally happen (the file matched above), but never
            # feed undef into the output.
            ($pre, $hit, $post) = ('', $search_terms[$i], '') unless defined $hit;

            if (!$text_mode) {
                # File content is arbitrary text: escape it so that it
                # cannot break the generated page.
                $_ = escape_html($_) for ($pre, $hit, $post);
            }

            $summary = "$summary ... $pre$highlight_start$hit$highlight_end$post";
        }
        $summary .= '... ';

        if ($text_mode) {
            push @results, "$num_found. \nFile:\nfile://$file\nSummary:\n$summary\n\n";
        }
        else {
            my $shown = escape_html($file);
            push @results,
                "\n\n<p>$num_found. <a href=\"file://$shown\">$shown</a><br>\n"
              . "$summary\n</p>\n\n\n\n\n";
        }
    }

    print "<p>\n\nNote: this search "
        . "works just like Google.  To find posts containing "
        . "the word black and the word white, just type black white into the box.  To "
        . "find posts containing the phrase black and white, type \"black and white\" in "
        . "quotes.  Without the quotes, you'll find posts containing all the words black, and, "
        . "and white, even if they're not together in a phrase.  Note also that you can search for "
        . "multiple phrases and/or single terms at the same time.\n</p>\n\n\n\n"
        unless $text_mode;

    my $count_phrase = $num_found < 1  ? 'No files'
                     : $num_found == 1 ? '1 file'
                     :                   "$num_found files";
    my $shown_query  = $text_mode ? $original_string : escape_html($original_string);

    print "${para_start}Search Results:\n\n";
    print $count_phrase, " found containing: $shown_query.${para_end}\n\n";

    print @results;

    # NOTE(review): the wording below talks about posters and post fields
    # and about checking boxes; presumably it is left over from a web-form
    # search this code was adapted from.  It is kept as text because
    # nothing visible in this file implements those options.
    print "<p>\n\nAdvanced search:\n\n\n<br>\n"
        . "search through poster names<br>\n"
        . "search through poster email addresses<br>\n"
        . "search through post dates<br>\n"
        . "search through post subjects<br>\n"
        . "search through post bodies<br>\n\n"
        . "(You must check at least one.  The normal search checks through all of these.)\n"
        . "</p>\n\n"
        unless $text_mode;

    print "\n$num_found files matched.\n" if $text_mode;

    return;
}

# parse_search_terms(string)
#
# Splits a query into its terms and returns them as a list.  Words
# separated by whitespace are separate terms; a double-quoted phrase is a
# single term containing spaces.  So a search like this:
#
#   blue green "black and white"
#
# ...yields the three terms 'blue', 'green' and 'black and white'.
sub parse_search_terms {
    my ($string) = @_;

    # First condense any multiple spaces:
    $string =~ s/\s+/ /g;

    # Temporarily replace the spaces inside each quoted phrase with a token
    # so that the phrase survives the split below in one piece.
    while ($string =~ /(.*)"(.+?)"(.*)/) {
        my ($front, $mid, $back) = ($1, $2, $3);
        $mid =~ s/\s/++TOKEN++/g;
        $string = $front . $mid . $back;
    }

    # Drop empty fields: a query with leading whitespace would otherwise
    # produce an empty term, which matches almost every file.
    my @terms = grep { length($_) } split /\s/, $string;

    # Turn the tokens back into spaces.
    s/\+\+TOKEN\+\+/ /g for @terms;

    return @terms;
}

# term_pattern(term)
#
# Returns a compiled, case-insensitive regex that finds term as literal
# text that is not directly preceded or followed by a word character.
#
# For ordinary words this is the same as wrapping the term in \b: a search
# for "no" matches "no" but not "know" or "not".  Unlike \b it also works
# for terms that begin or end with punctuation (e.g. "c++"), and the
# quotemeta keeps regex metacharacters in the term from being interpreted.
sub term_pattern {
    my ($term) = @_;

    my $literal = quotemeta $term;
    return qr/(?<!\w)$literal(?!\w)/i;
}

# matches_all_patterns(text, patterns...)
#
# Returns 1 when text matches every one of the compiled patterns (also
# when no patterns are given), 0 as soon as one of them does not match.
sub matches_all_patterns {
    my ($text, @patterns) = @_;

    for my $pattern (@patterns) {
        return 0 if $text !~ $pattern;
    }
    return 1;
}

# read_file(file)
#
# Returns the whole content of file as one string, or nothing (undef in
# scalar context) after printing a warning when the file cannot be opened.
sub read_file {
    my ($file) = @_;

    my $fh;
    unless (open $fh, '<', $file) {
        warn "$0: couldn't open $file: $!\n";
        return;
    }

    # We only read, so a shared lock is sufficient.  The result is ignored:
    # locking is best-effort and not every filesystem supports it.
    flock $fh, LOCK_SH;

    my $content = do { local $/; <$fh> };

    close $fh or warn "$0: couldn't close $file: $!\n";

    return defined $content ? $content : '';
}

# escape_html(text)
#
# Returns a copy of text with the characters & < > " replaced by their
# HTML entities.
sub escape_html {
    my ($text) = @_;

    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;

    return $text;
}

# get_filenames(path, ext)
#
# Returns the names of all regular files below path (searched recursively
# with find(1)) whose name ends in one of the requested extensions,
# compared case-insensitively.  ext is either a comma-separated list of
# extensions or the word "alltext" for a built-in list of common text file
# extensions.  The names are returned without a trailing newline, grouped
# by extension in the order the extensions were given.
sub get_filenames {
    my ($path, $ext) = @_;

    my @exts = $ext eq 'alltext'
        ? qw(txt log htm html shtml php php3 asp cfm conf stm)
        : split /,\s*/, $ext;

    # Keep find from mistaking a path that starts with "-" for an option.
    $path = "./$path" if $path =~ /^-/;

    my @files;
    for my $one_ext (@exts) {
        # List-form pipe open: the arguments go to find directly, without a
        # shell, so spaces or shell metacharacters in the path or the
        # extension are harmless.
        my $find;
        unless (open $find, '-|', 'find', $path, '-type', 'f',
                                  '-iname', "*.$one_ext", '-print0') {
            warn "$0: couldn't run find: $!\n";
            next;
        }

        # -print0 separates the names with NUL bytes, so that even names
        # containing newlines come back intact.
        local $/ = "\0";
        while (my $name = <$find>) {
            chomp $name;
            push @files, $name;
        }

        # A failing find has already written its own diagnostics to
        # STDERR, so the exit status is not checked here.
        close $find;
    }

    return @files;
}

# strip_html(strings...)
#
# Removes everything that looks like a tag (the shortest possible run from
# "<" to ">", also across line breaks) from each argument.  The arguments
# are modified in place.
sub strip_html {
    for my $text (@_) {
        $text =~ s/<.+?>//gs;
    }
    return;
}

# compress_whitespace(strings...)
#
# Turns every run of whitespace (which includes newlines, etc.) in each
# argument into a single space.  The arguments are modified in place.
sub compress_whitespace {
    for my $text (@_) {
        $text =~ s/\s+/ /g;
    }
    return;
}

1;