#!/usr/bin/perl ########################################################################### # build_referer_db.pl # Maintain a persistent tree of referers mined from access logs # against which to later query for a given page # # Maintained by l.m.orchard http://www.decafbad.com # # To do: # - Delete referers past a certain age (or leave it all permanent?) # - Test reachability of links # - Pull title from linking page # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details, published at # http://www.gnu.org/copyleft/gpl.html ########################################################################### $|--; use strict; use URI::URL; use Date::Parse; use Data::Dumper qw(Dumper); use DBI; use LWP::Simple; use vars qw($db_host $db_name $db_user $db_pass); ### Global config my $HOME = "/www/www.decafbad.com"; my $log_fn = "$HOME/logs/access_log"; require "$HOME/conf/mysql_auth.pl"; ### Initialize my $dsn = "DBI:mysql:database=$db_name;host=$db_host"; my $dbh = DBI->connect($dsn, $db_user, $db_pass); my $lastdate_sth = $dbh->prepare (qq^ SELECT UNIX_TIMESTAMP(created) FROM referer_links ORDER BY created DESC LIMIT 1 ^); my $link_select_sth = $dbh->prepare (qq^ SELECT hits, ref_title FROM referer_links WHERE site_path=? AND ref_url=? ^); my $link_new_sth = $dbh->prepare (qq^ INSERT INTO referer_links (site_path, ref_url, ref_title, hits, created) VALUES (?,?,?,1,FROM_UNIXTIME(?)) ^); my $link_inc_sth = $dbh->prepare (qq^ UPDATE referer_links SET hits = hits + 1 WHERE site_path=? AND ref_url=? ^); my $link_title_inc_sth = $dbh->prepare (qq^ UPDATE referer_links SET hits = hits + 1, ref_title = ? WHERE site_path=? AND ref_url=? ^); ### Get the datestamp on the newest hit we've already recorded $lastdate_sth->execute(); my ($lastdate) = $lastdate_sth->fetchrow_array(); $lastdate_sth->finish(); #$lastdate=0; ### Parse the access log, extract referrals open(FIN, $log_fn); my $row_cnt = 0; while (my $line = ) { #last if ($row_cnt++ > $max_row); $row_cnt++; my ($remote_host, $remote_user, $username, $date, $request, $response_code, $content_len, $referer, $user_agent) = ($line =~ /([\d\.]+) ([^ ]+) ([^ ]+) \[([^\]]+)\] "([^"]+)" ([^ ]+) ([^ ]+) "([^"]+)" "([^"]+)"/); ### Crudely convert Apache time into UNIX epoch $date =~ /(..)\/(...)\/(....)\:(..)\:(..)\:(..) (.*)/; $date = str2time("$1-$2-$3 $4:$5:$6 $7"); if ( ($row_cnt % 10000) == 0 ) { if ($date < $lastdate) { print "SKIPPED $row_cnt ROWS...\n"; } else { print "PROCESSED $row_cnt ROWS...\n"; } } ### Skip if we already have data newer than this row next if ($date < $lastdate); ### Skip if this row has no referer information next if ($referer eq '-'); my $url = new URI::URL($referer); $referer = "$url"; next if ($request =~ /.gif/); next if ($request =~ /.jpg/); next if ($referer =~ /decafbad.com/); next if ($referer =~ /192.168/); next if ($referer =~ /5335/); next if ($referer =~ /127.0.0.1/); next if ($referer =~ /8888/); next if ($referer =~ /4888/); next if ($referer =~ /48888/); next if ($referer =~ /google.com\/search/); next if ($referer =~ /google.com\/search/); next if ($referer =~ /search.msn.com/); next if ($referer =~ /search.yahoo.com/); next if ($referer =~ /aolsearch.aol.com/); next if ($referer =~ /cometsystems.com/); next if ($referer =~ /tiffany/); next if ($referer eq 'http://radio.outliners.com/instantOutliner'); next if ($referer eq 'http://frontier.userland.com/xmlAggregator'); ### Split the request fields up for easy use my ($request_method, $request_path, $request_proto) = split(/ /, $request); $link_select_sth->execute($request_path, $referer); my ($count, $title) = $link_select_sth->fetchrow_array(); if ($count > 0) { if ($title) { $link_inc_sth->execute($request_path, $referer); } else { my $content = get($referer); my $title = get_title($content); print "UPDATE REFERER TITLE: ".$request_path." - ".$title." ".$referer."\n"; $link_title_inc_sth->execute($title, $request_path, $referer); } } else { my $content = get($referer); if (contains_link($request_path, $content)) { my $title = get_title($content); print "NEW REFERER: ".$request_path." - ".$title." - ".$referer."\n"; $link_new_sth->execute($request_path, $referer, $title, $date); } } } $link_select_sth->finish(); $link_inc_sth->finish(); $link_new_sth->finish(); $dbh->disconnect(); close(FIN); sub contains_link { my ($link, $content) = @_; my $mlink = quotemeta($link); return ($content =~ /$mlink/) ? 1 : undef; } sub get_title { my $content = shift; $content =~ /\<[tT][iI][tT][lL][eE].*\>([^\<]*)\<\/[tT][iI][tT][lL][eE]\>/; my $page_title = $1; $page_title =~ s/^\s+//; $page_title =~ s/\s+$//; return $page_title; }