#!/usr/bin/perl
#
# gscloud.pl - Google Search Cloud
#
# (c) Copyright, 2006 By John Bokma, http://johnbokma.com/
#
# Last updated: 2006-10-05 01:41:44 -0500

use strict;
use warnings;

use Carp;
use Encode;
use HTML::Entities;
use URI::Escape;
use Getopt::Long;

# Moment the run started; the page footer reports the seconds elapsed since.
my $time    = time();

# Defaults for the command line options (each one is bound in GetOptions).
my $steps   = 18;         # number of distinct cloud sizes
my $mapping = q{log};     # frequency to size mapping: log or lin
my $sort    = q{alpha};   # order of the phrases in a cloud: alpha or num
my $limit   = 75;         # maximum number of phrases per cloud
my $scale   = 0;          # true: reduce steps when a cloud has fewer phrases
my $prefix  = q{};        # when set, section titles link to prefix + path


# Print a short usage message and terminate the program.
#
# This is the handler for an unusable command line (in this file it runs
# when no access log is named), so the text goes to STDERR and the exit
# status is non-zero: the help no longer ends up inside redirected HTML
# output and a calling shell can detect the failure.
#
# The values shown as defaults are the current contents of the file-level
# option variables. Takes no arguments and does not return.
sub print_usage_and_exit {

    print STDERR <<"USAGE";
usage: gscloud.pl [OPTIONS] ACCESS_LOG

options:

    --steps   - number of cloud sizes, default $steps
    --mapping - log or lin, default $mapping
    --sort    - alpha or num, default $sort
    --limit   - maximum number of phrases, default $limit
    --scale   - scale when phrases less than steps, default $scale
    --prefix  - prefix for paths (creates links), default none
USAGE

    exit 1;
}


# Parse the command line into the file-level option variables. GetOptions
# returns false for an unknown option or a malformed value (it has already
# reported the problem on STDERR), so stop with the usage message instead
# of continuing with partially parsed settings.
GetOptions(

    "steps=i"   => \$steps,
    "mapping=s" => \$mapping,
    "sort=s"    => \$sort,
    "limit=i"   => \$limit,
    "scale=i"   => \$scale,
    "prefix=s"  => \$prefix
) or print_usage_and_exit();

# The first argument left over after the options is the access log.
my $filename = shift;
defined $filename or print_usage_and_exit();

# Three-argument open: the name is only ever used as a file to read, even
# when it starts or ends with characters such as '>' or '|' that the
# two-argument form would take as a mode or as a command to run.
open my $fh, '<', $filename
    or die "Can't open '$filename' for reading: $!";

# Tally Google search phrases per requested path and response status.
#
# %stats maps "path:status" to a hash with two entries:
#   sum     - number of matching log lines
#   queries - hash of search phrase => number of occurrences
#
# Only GET requests whose referrer is a Google URL carrying a search
# parameter are counted. The parameter has to start at a '?' or '&' and be
# named q (or as_q / as_epq / as_oq, which look like Google's advanced
# search fields); otherwise unrelated parameters whose names merely end in
# "q", such as aq= or oq=, would supply the phrase.
my %stats;
while ( my $line = <$fh> ) {

    my ( $path, $status, $query ) = $line =~ m{

        \[\d{2}/\w{3}/\d{4}(?::\d\d){3}.+?\]    # timestamp in brackets
        \s"GET\s(\S+)\sHTTP/\d\.\d"              # request, captures the path
        \s(\S+)                                  # captures the status
        \s\S+                                    # response size
        \s"https?://w{1,3}\.google\.             # referrer on a Google host
        (?:[a-z]{2}|com?\.[a-z]{2}|com)\.?/
        [^"]*?[?&](?:as_(?:ep|o)?)?q=            # first search parameter
        ([^"&]+)[^"]*"                           # captures the raw phrase

    }xi or next;

    # Plus signs stand for spaces and must be replaced before unescaping,
    # so that an escaped plus (%2B) survives as a literal plus sign.
    $query =~ s/\+/ /g;

    # Trim the phrase and collapse every run of whitespace to one space.
    $query = join ' ', split ' ', uri_unescape( $query );

    # A phrase made of whitespace only is not a search phrase.
    length $query or next;

    my $key = "$path:$status";
    $stats{ $key }{ sum }++;
    $stats{ $key }{ queries }{ $query }++;
}

close $fh or die "Can't close '$filename' after reading: $!";

print_html_start();

# Sections with the most search hits come first. Equal totals are ordered
# by key, so repeated runs over the same log produce the same page.
my @ps = sort {
    $stats{ $b }{ sum } <=> $stats{ $a }{ sum } or $a cmp $b
} keys %stats;

for my $ps ( @ps ) {

    # Keys have the form "path:status"; the greedy first group keeps any
    # colon that belongs to the path itself.
    my ( $path, $status ) = $ps =~ /\A(.*):(\S+)\z/s;
    my $sum = $stats{ $ps }{ sum };

    # Path and status are copied from the access log and the prefix from
    # the command line, so all of them are escaped before they are used as
    # markup text or inside the href attribute.
    my $section = encode_entities( $path );
    if ( $prefix ) {

        $section = sprintf '<a href="%s">%s</a>',
            encode_entities( "$prefix$path" ), $section;
    }

    print "<h2>$section",
        qq( <span class="small">total: $sum, status: ),
        encode_entities( $status ), "</span>",
        "</h2>\n";
    print_cloud_as_html_list(

        frequencies => $stats{ $ps }{ queries },
        steps => $steps,
        mapping => $mapping,
        sort => $sort,
        limit => $limit,
        scale => $scale,
    );
}

# The footer reports how many seconds the whole run took.
print_html_end( time - $time );
exit;


# Print the fixed opening of the report on the currently selected output
# handle: the HTML 4.01 strict doctype, the head (ISO-8859-1 charset
# declaration, title, link to gscloud.css) and the page heading. Takes no
# arguments.
sub print_html_start {

    my @header_lines = (
        q{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"},
        q{ "http://www.w3.org/TR/html4/strict.dtd">},
        q{<html>},
        q{<head>},
        q{    <meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">},
        q{    <title>Google Search Cloud (beta)</title>},
        q{    <link rel="stylesheet" type="text/css" href="gscloud.css">},
        q{</head>},
        q{<body>},
        q{<h1>Google <span class="blue">Search Cloud</span>},
        q{<span class="beta">beta</span></h1>},
    );

    # Every line, the last one included, ends with a newline.
    my $html = join q{}, map { "$_\n" } @header_lines;
    print $html;
}


# Print the page footer on the currently selected output handle.
#
# The single argument is the number of seconds it took to generate the
# page; it is inserted into the footer text as given.
sub print_html_end {

    my ( $elapsed ) = @_;

    my @footer_lines = (
        q{<div class="footer">},
        q{    <a href="http://johnbokma.com/perl/google-search-cloud.html">Google},
        qq{    Search Cloud</a>, written by John Bokma, took $elapsed seconds to},
        q{    generate this page.},
        q{</div>},
    );

    # Every line, the last one included, ends with a newline.
    my $html = join q{}, map { "$_\n" } @footer_lines;
    print $html;
}


# Print one search cloud as an HTML unordered list on the currently
# selected output handle. Each item gets the class "sizeN", where N runs
# from 1 (least frequent) up to the number of steps (most frequent).
#
# Named parameters:
#   frequencies - required; hash reference mapping phrase => count
#   steps       - required; number of size classes
#   mapping     - 'log' (default) or 'lin'; how counts map to size classes
#   sort        - 'alpha' (default) or 'num'; alphabetical (ignoring case)
#                 or most frequent first
#   limit       - optional; keep only this many of the most frequent phrases
#   scale       - optional; when true and fewer phrases than steps remain,
#                 use as many steps as there are phrases
#
# Croaks when a required parameter is missing or when mapping / sort has an
# unsupported value. Prints nothing when there are no phrases.
#
# Phrases that compare equal fall back to a plain comparison of the phrase,
# so neither the selection made by limit nor the order of the items depends
# on the order in which the hash returns its keys.
sub print_cloud_as_html_list {

    my %params = @_;

    my $frequencies = $params{ frequencies }
        or croak "Parameter 'frequencies' not given";

    my $steps = $params{ steps }
        or croak "Parameter 'steps' not given";

    my $mapping = $params{ mapping } || 'log';
    $mapping eq 'log' or $mapping eq 'lin'
        or croak "Parameter 'mapping' has an unsupported value ($mapping)";

    my $sort_method = $params{ sort } || 'alpha';
    $sort_method eq 'alpha' or $sort_method eq 'num'
        or croak "Parameter 'sort' has an unsupported value ($sort_method)";

    # most frequent first; equal counts are ordered by the phrase itself
    my @keys = sort {
        $frequencies->{ $b } <=> $frequencies->{ $a } or $a cmp $b
    } keys %$frequencies;

    # if there is a limit, take the top limit frequencies
    $params{ limit } and @keys = splice @keys, 0, $params{ limit };
    @keys or return;    # nothing to do

    $steps = @keys if $params{ scale } and $steps > @keys;
    my $max_steps = $steps - 1;

    # @keys is still ordered by frequency here: first is max, last is min
    my ( $max, $min ) = @$frequencies{ $keys[ 0 ], $keys[ -1 ] };

    print qq(<ul class="cloud">\n);

    # Maps a phrase to its size class. When all counts are equal there is
    # nothing to spread (and the divisions below would divide by zero), so
    # every phrase gets size 1.
    my $step = $min == $max
        ? sub { 1 }
        : $mapping eq 'log'
            ? sub {

                1 + int( $max_steps * (
                    ( log( $frequencies->{ $_[ 0 ] } ) - log( $min )) /
                    ( log( $max ) - log( $min ) ) )
                )
            }
            : sub {

                1 + int( $max_steps *
                    ( $frequencies->{ $_[ 0 ] } - $min ) /
                    ( $max - $min )
                )

            };

    # phrases that differ in case only are ordered by a plain comparison
    $sort_method eq 'alpha'
        and @keys = sort { lc $a cmp lc $b or $a cmp $b } @keys;

    print '  <li class="size' . $step->( $_ ) . '">',
        encode_entities( _decode_phrase( $_ ) ), "</li>\n" for @keys;

    print "</ul>\n";
}


# Turn the raw bytes of a phrase into characters: as UTF-8 when the bytes
# form valid UTF-8, otherwise as ISO-8859-1, so that every byte still maps
# to a character instead of the U+FFFD replacement character.
sub _decode_phrase {

    my ( $octets ) = @_;

    # a checked decode may modify its input, so it works on a copy
    my $copy = $octets;
    my $text = do {

        local $@;
        eval { Encode::decode( 'utf8', $copy, Encode::FB_CROAK() ) };
    };
    defined $text and return $text;

    return Encode::decode( 'iso-8859-1', $octets );
}

