#!/usr/bin/perl -wT

#########################################################################
#
# Copyright 2006-2012 ClearFoundation
#
#########################################################################
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
# Based on squidparse.pl by Simon Burns (http://squidalyser.sourceforge.net/)
# Heavily modified for Point Clark Networks by W.H Welch <whw3@whw3.com>
#
#########################################################################

#########################################################################
#
# PROXY AND CONTENT FILTER LOGS
#
# When the content filter is running, useful and unique information is
# provided in both the Squid and DansGuardian logs.  For example, the 
# "duration" of a page view is only available in Squid.  The "reason" for
# blocking a particular page is only available in DansGuardian.  From
# a practical view, the end-user could care less about where the data
# resides -- they just want the data.
# 
# For this reason, this script will merge the data from both logs into
# one table. 
#
#########################################################################

use strict;
use DBI;
use Digest::MD5 qw(md5 md5_hex md5_base64);
use Getopt::Std;
use IO::Zlib;
use Socket;
use Sys::Syslog qw(:DEFAULT setlogsock);
use Time::ParseDate;
use Fcntl;
use JSON;

#------------------------------------------------------------------------
# Initialization
#------------------------------------------------------------------------

$| = 1; # Flush output immediately
$0 = "proxy2db";

$ENV {'PATH'}  = '/sbin:/usr/bin:/bin';
$ENV {'SHELL'} = '/bin/sh';
delete @ENV {'ENV', 'BASH_ENV'};  # Perl security stuff - see "man perlsec"

#------------------------------------------------------------------------
# Functions
#------------------------------------------------------------------------

sub startup();
sub db_connect();
sub db_disconnect();
sub get_proxy_log_list();
sub get_filter_messages();
sub load_dataset();
sub load_filter_logs($$$);
sub log_msg($);
sub prep_maps();
sub process_proxy_log();
sub usage();

#------------------------------------------------------------------------
# Variable initalization
#------------------------------------------------------------------------

my $g_opt_string = 'fahv';
my %g_opt;
my $g_dbh;
my %g_unresolved;
my @g_filter_logs;
my %g_filter_messages;
my $g_filter_fileseek = 0;
my $g_proxy_fileseek = 0;
my %g_dataset;
my %g_filterset;
my $g_debug = 0;
my $g_dbcount = 0;
my $g_user_map;

my $FILE_GEOIP = "/usr/share/GeoIP/GeoLiteCity.dat";
my $FORMAT_SQUID = 1;
my $FORMAT_DANSGUARDIAN = 2;
my $DB_EXPIRE_UNIT = "w";
my $DB_EXPIRE_VALUE = "156";
my $SYSLOG_NAME = "local6";

#------------------------------------------------------------------------
# Usage
#------------------------------------------------------------------------

getopts("$g_opt_string", \%g_opt ) or usage();
usage() if defined $g_opt{h};

#------------------------------------------------------------------------
# Main
#------------------------------------------------------------------------

startup();
prep_maps();
db_connect();
get_filter_messages();
process_proxy_log();
db_disconnect();

# TODO: quick hack for beta
`/bin/rm /var/clearos/reports_database/cache/proxy/* -f`;

exit 0;

#########################################################################
#
# F U N C T I O N S
#
#########################################################################

#------------------------------------------------------------------------
# startup - do some general startup tasks
#------------------------------------------------------------------------

sub startup() {
    my $isrunning = `/sbin/pidof -x proxy2db`;
    chomp($isrunning);

    if ($isrunning && ($isrunning ne $$)) {
        log_msg("Already running.");
        print("Already running\n");
        exit 0;
    }
}

#------------------------------------------------------------------------
# log_msg - logs a message to syslog
#------------------------------------------------------------------------

sub log_msg($) {
    setlogsock('unix');
    openlog('proxy2db', 'ndelay', $SYSLOG_NAME);
    syslog('info', $_[0]);
    closelog();
}

#------------------------------------------------------------------------
# get_filter_messages - returns a hash array of filter messages
#
# The %g_filter_messages global variable is populated with the 
# DansGuardian translations and error code numbers found in
# /etc/dansguardian-av/languages/*/messages:
#
# %g_filter_messages{'Banned phrase found.'} = 301;
# %g_filter_messages{'Phrase interdite trouvee.'} = 301;
# ... 
# %g_filter_messages{'Banned site: '} = 500;
#
# See parse_dansguardian_log_line to find out why this is necessary.
#------------------------------------------------------------------------

sub get_filter_messages() {
    my @filelist;

    if (-d "/etc/dansguardian-av/languages") {
        @filelist = `/usr/bin/find /etc/dansguardian-av/languages -name messages`;
    } elsif (-d "/etc/dansguardian/languages") {
        @filelist = `/usr/bin/find /etc/dansguardian/languages -name messages`;
    } else {
        return;
    }

    foreach my $file (@filelist) {
        chomp($file);
        open(MESSAGES, $file) or next;

        while (<MESSAGES>) {
            $g_filter_messages{$2} = $1 if (/^"(\d+)","(.*)"$/);
        }
    }
}

#------------------------------------------------------------------------
# get_proxy_log_list - returns a list of log files to process.  
#------------------------------------------------------------------------

sub get_proxy_log_list() {
    my $directory = "/var/log/squid";
    my $basefile = "access.log";

    my @loglist;

    if (defined($g_opt{a})) {
        # Implies a -f / force
        $g_opt{f} = 1;

        opendir LOGDIR, $directory or die $!;

        while (my $logfile = readdir LOGDIR) {
            if (($logfile =~ /$basefile$/) || ($logfile =~ /$basefile.\d+$/) || ($logfile =~ /$basefile.\d+.gz$/)) {
                $logfile =~ s/$basefile//;
                $logfile =~ s/\.gz$//;
                push @loglist, "$logfile";
            }
        }

        @loglist = sort @loglist;
        @loglist = reverse @loglist;

        closedir LOGDIR; 

    } else {
        push @loglist, "" if ((-e "$directory/$basefile") || (-e "$directory/$basefile.gz"));
    }

    return @loglist;
}

#------------------------------------------------------------------------
# prep_maps - loads MAC and Username maps
#------------------------------------------------------------------------

sub prep_maps() {
    if (-e '/var/clearos/network_map/usernames.dat') {
        my $username_json = `cat /var/clearos/network_map/usernames.dat`;

        $g_user_map = decode_json($username_json);
    }
}

#------------------------------------------------------------------------
# process_proxy_log - load proxy log data.
#
# while (still_parsing_logs)
#   - load next 5000 lines from the proxy log into record set ($g_dataset)
#   - load proxy server logs with the same date range ($g_filterset)
#   - merge the two data sets (the proxy log is the "master" set)
#   - dump the merged data into the database
#------------------------------------------------------------------------

sub process_proxy_log() {

    # Determine which log files to process
    #-------------------------------------

    my @filenumbers = get_proxy_log_list();

    # Determine time of last run
    #---------------------------

    my $lastrun = 0;
    my $query = qq|select UNIX_TIMESTAMP(MAX(timestamp)) FROM proxy|;
    my $sth = $g_dbh->prepare($query);
    eval { $sth->execute; };

    if ($@) {
        if ($DBI::err) {
            # Repair the table and try again
            log_msg("Repairing proxy table.");
            $sth = $g_dbh->prepare("repair table proxy");
            $sth->execute or die "Could not execute statement: " . $sth->errstr;

            $sth = $g_dbh->prepare($query);
            $sth->execute or die "Could not execute statement: " . $sth->errstr;
        }
    }

    my $lastrunval = $sth->fetchrow;
    $lastrun = $lastrunval if ($lastrunval);
    $sth->finish;

    # Load the log data
    #------------------

    my $retval;
    my $linecount = 0;
    my $datacount = 0;
    my $date_from;
    my ($timestamp, $duration, $ip, $cache_code, $status, $bytes, $method, $request, $username, $hierarchy, $content_type, $domain, $md5);

    foreach my $filenumber (@filenumbers) {

        # Open the log file
        #------------------

        my $logfile = "/var/log/squid/access.log" . $filenumber;
    
        if (-e "$logfile.gz" ) {
            $logfile = "$logfile.gz";
            tie *PROXY, 'IO::Zlib', $logfile, "rb";
        } elsif (-e "$logfile" ) {
            open (PROXY, "< $logfile") or die $!;
        } else {
            next;
        }

        print "Processing $logfile ";
    
        # If this is the first time, we are efficient at find a good place 
        # to start scanning the file.
        #-----------------------------------------------------------------

        if ($g_proxy_fileseek == 0) {
            $timestamp = 0;
            my $safe_position = 0;
            my $current_position = 0;
            my $proxyline;
            
            while ($timestamp < $lastrun) {
                print "debug: searching proxy log at byte $current_position\n" if $g_debug;
                $safe_position = $current_position;
                $current_position = $current_position + 1000000;

                seek PROXY, $current_position, 0;
                $proxyline = <PROXY>; # First read may incomplete line
                $proxyline = <PROXY>;
                last if (!$proxyline);

                ($retval, $timestamp, $duration, $ip, $cache_code, $status, $bytes, $method, $request, $username, $hierarchy, $content_type, $domain, $md5) = parse_squid_log_line($proxyline);
            }

            $g_proxy_fileseek = $safe_position;

            seek PROXY, $g_proxy_fileseek, 0;
        }

        # Process the log file
        #---------------------

        while (my $line = <PROXY>) {
            $linecount++;
    
            ($retval, $timestamp, $duration, $ip, $cache_code, $status, $bytes, $method, $request, $username, $hierarchy, $content_type, $domain, $md5) = parse_squid_log_line($line);
            next if ($retval == 1);

            if (defined($g_opt{f}) || ($timestamp >= $lastrun)) {
                next if ($request =~ /:82.public.filtered.php/);

                $username = $g_user_map->{$ip} if ((($username =~ /^\s*$/) || ($username eq '-')) && (ref($g_user_map) eq 'HASH') && (defined($g_user_map->{$ip})));

                print "debug: loading proxy request $timestamp|$request\n" if $g_debug;
                $g_dataset{$timestamp . "|" . $request}->{timestamp} = $timestamp;
                $g_dataset{$timestamp . "|" . $request}->{duration} = $duration;
                $g_dataset{$timestamp . "|" . $request}->{ip} = $ip;
                $g_dataset{$timestamp . "|" . $request}->{cache_code} = $cache_code;
                $g_dataset{$timestamp . "|" . $request}->{status} = $status;
                $g_dataset{$timestamp . "|" . $request}->{bytes} = $bytes;
                $g_dataset{$timestamp . "|" . $request}->{method} = $method;
                $g_dataset{$timestamp . "|" . $request}->{request} = $request;
                $g_dataset{$timestamp . "|" . $request}->{username} = $username;
                $g_dataset{$timestamp . "|" . $request}->{content_type} = $content_type;
                $g_dataset{$timestamp . "|" . $request}->{domain} = $domain;
                $g_dataset{$timestamp . "|" . $request}->{md5} = $md5;
                $datacount++;

                if (($datacount % 3000) == 0){
                    load_filter_logs($date_from, $timestamp, $filenumber);
                    load_dataset();
                    $date_from = $timestamp;
                    print "." if (!$g_debug);
                } elsif ($datacount == 1) {
                    $date_from = $timestamp;
                }
            }
        }

        load_filter_logs($date_from, $timestamp, $filenumber);
        load_dataset();
    
        if ($logfile =~ /\.gz$/) {
            untie *PROXY;
        } else {
            close PROXY;
        }

        $g_proxy_fileseek = 0;

        print "\n";
    }

    # Show summary information
    if ($g_dbcount > 0) {
        print qq|Processed $linecount proxy log lines - $g_dbcount records added.\n|;
        log_msg("Processed $linecount proxy log lines - $g_dbcount records added.");
    }
}

#------------------------------------------------------------------------
# load_filter_logs
#------------------------------------------------------------------------
#
# Gotchas:
# - The time stamps in the proxy and filter logs can vary quite a bit
#   ... up to a few seconds.  A fuzz factor is used.
# - There's no guarantee that the order in the logs will be the same.
# - The file seek was an afterthought
#    
#--------------------------------------------------------------------

sub load_filter_logs($$$) {
    my $date_from = $_[0];
    my $date_to = $_[1];
    my $filenumber = $_[2];
    my $retval, my $timestamp, my $date, my $username, my $ip, my $request, my $code, my $detail, my $method, my $bytes, my $weight;

    # Fuzz the time a bit
    #--------------------

    $date_from =~ s/\..*//;
    $date_from = $date_from - 3;

    $date_to =~ s/\..*//;
    $date_to = $date_to + 3;

    # Open the file
    #--------------

    my $logfile = "/var/log/dansguardian/access.log$filenumber";

    if (-e "$logfile.gz" ) {
        $logfile = "$logfile.gz";
        tie *FILTER, 'IO::Zlib', $logfile, "rb";
    } elsif (-e "$logfile" ) {
        open (FILTER, "< $logfile") or die $!;
    } else {
        return;
    }

    print "debug: loading content filter logs between $date_from -> $date_to\n" if $g_debug;

    # If this is the first time, we are efficient at find a good place 
    # to start scanning the file.
    #-----------------------------------------------------------------

    if ($g_filter_fileseek == 0) {
        $timestamp = 0;
        my $safe_position = 0;
        my $current_position = 0;
        my $filterline;
        
        while ($timestamp < $date_from) {
            print "debug: searching content filter log at byte $current_position\n" if $g_debug;
            $safe_position = $current_position;
            $current_position = $current_position + 1000000;

            seek FILTER, $current_position, 0;
            $filterline = <FILTER>; # First read may incomplete line
            $filterline = <FILTER>;
            last if (!$filterline);

            ($retval, $timestamp, $date, $username, $ip, $request, $code, $detail, $method, $bytes, $weight) = parse_dansguardian_log_line($filterline);
        }

        $g_filter_fileseek = $safe_position;
    } else {
        $g_filter_fileseek = $g_filter_fileseek - 10000;
        $g_filter_fileseek = 0 if ($g_filter_fileseek  < 0);
    }

    # Set the location of the file and load data
    #-------------------------------------------

    seek FILTER, $g_filter_fileseek, 0;

    while (my $filterline = <FILTER>) {
        ($retval, $timestamp, $date, $username, $ip, $request, $code, $detail, $method, $bytes, $weight) = parse_dansguardian_log_line($filterline);

        next if ($retval == 1);

        if ($timestamp >= $date_to) {
            print "debug: finished content filter log processing\n" if $g_debug;
            $g_filter_fileseek = tell FILTER;
            close FILTER;
            return;
        } elsif (($timestamp >= $date_from) && ($timestamp <= $date_to)) {
            print "debug: loading content filter request $timestamp|$request\n" if $g_debug;
            $g_filterset{$timestamp . "|" . $request}->{filter_code} = $code;
            $g_filterset{$timestamp . "|" . $request}->{filter_detail} = $detail;
            $g_filterset{$timestamp . "|" . $request}->{filter_weight} = $weight;
            $g_filterset{$timestamp . "|" . $request}->{filter_ip} = $ip;

            # Classify the code values into sane types.  In theory, these can overlap 
            # (e.g. a malware *and* a blacklist site), but are mutually exclusive for now.

            if (!defined($code)) {
                $g_filterset{$timestamp . "|" . $request}->{filter_malware} = 0;
                $g_filterset{$timestamp . "|" . $request}->{filter_block} = 0;
                $g_filterset{$timestamp . "|" . $request}->{filter_blacklist} = 0;
            } elsif (($code == 1101) || ($code == 1100)) {
                $g_filterset{$timestamp . "|" . $request}->{filter_malware} = 1;
                $g_filterset{$timestamp . "|" . $request}->{filter_block} = 0;
                $g_filterset{$timestamp . "|" . $request}->{filter_blacklist} = 0;
            } elsif (($code == 500) || ($code == 501)) {
                $g_filterset{$timestamp . "|" . $request}->{filter_malware} = 0;
                $g_filterset{$timestamp . "|" . $request}->{filter_block} = 0;
                $g_filterset{$timestamp . "|" . $request}->{filter_blacklist} = 1;
            } elsif ((($code > 0) && ($code < 600)) || (($code >= 700) && ($code < 1200))) {
                $g_filterset{$timestamp . "|" . $request}->{filter_malware} = 0;
                $g_filterset{$timestamp . "|" . $request}->{filter_block} = 1;
                $g_filterset{$timestamp . "|" . $request}->{filter_blacklist} = 0;
            } else {
                $g_filterset{$timestamp . "|" . $request}->{filter_malware} = 0;
                $g_filterset{$timestamp . "|" . $request}->{filter_block} = 0;
                $g_filterset{$timestamp . "|" . $request}->{filter_blacklist} = 0;
            }
        }
    }

    if ($logfile =~ /\.gz$/) {
        untie *FILTER;
    } else {
        close FILTER;
    }
}

#------------------------------------------------------------------------
# load_dataset - loads current dataset into database
#------------------------------------------------------------------------

sub load_dataset() {

    print "debug: loading data sets into database\n" if $g_debug;

    # Merge the two data sets (proxy logs, filter logs)
    #--------------------------------------------------

    foreach my $key (keys %g_dataset) {
        my ($timestamp_proxy, $request_proxy) = split(/\|/, $key);
        my $timestamp_proxy_rounded = $timestamp_proxy;
        $timestamp_proxy_rounded =~ s/\..*//;

        foreach my $filterkey (keys %g_filterset) {
            my ($timestamp_filter, $request_filter) = split(/\|/, $filterkey);
            next if ($request_proxy ne $request_filter);

            my $match = 0;

            if ($timestamp_proxy_rounded == $timestamp_filter) {
                print "debug: match fuzz 0 $request_proxy @ " . $timestamp_proxy_rounded . "\n" if $g_debug;
                $match = 1;
            } elsif ($timestamp_proxy_rounded == ($timestamp_filter - 1)) {
                print "debug: match fuzz -1 $request_proxy @ " . $timestamp_proxy_rounded . "\n" if $g_debug;
                $match = 1;
            } elsif ($timestamp_proxy_rounded == ($timestamp_filter - 2)) {
                print "debug: match fuzz -2 $request_proxy @ " . $timestamp_proxy_rounded . "\n" if $g_debug;
                $match = 1;
            } elsif ($timestamp_proxy_rounded == ($timestamp_filter - 3)) {
                print "debug: match fuzz -3 $request_proxy @ " . $timestamp_proxy_rounded . "\n" if $g_debug;
                $match = 1;
            } elsif ($timestamp_proxy_rounded == ($timestamp_filter + 1)) {
                print "debug: match fuzz +1 $request_proxy @ " . $timestamp_proxy_rounded . "\n" if $g_debug;
                $match = 1;
            } elsif ($timestamp_proxy_rounded == ($timestamp_filter + 2)) {
                print "debug: match fuzz +2 $request_proxy @ " . $timestamp_proxy_rounded . "\n" if $g_debug;
                $match = 1;
            }

            if ($match == 1) {
                # Override IP in case X-Forward is disabled for some reason
                $g_dataset{$key}->{ip} = $g_filterset{$filterkey}->{filter_ip};
                $g_dataset{$key}->{filter_code} = $g_filterset{$filterkey}->{filter_code};
                $g_dataset{$key}->{filter_malware} = $g_filterset{$filterkey}->{filter_malware};
                $g_dataset{$key}->{filter_block} = $g_filterset{$filterkey}->{filter_block};
                $g_dataset{$key}->{filter_blacklist} = $g_filterset{$filterkey}->{filter_blacklist};
                $g_dataset{$key}->{filter_detail} = $g_filterset{$filterkey}->{filter_detail};
                $g_dataset{$key}->{filter_weight} = $g_filterset{$filterkey}->{filter_weight};
                $g_dataset{$key}->{filter_data} = 1;
                delete $g_filterset{$filterkey};
                last;
            }
        }
    }

    # Dump merged data into database
    #-------------------------------

    foreach my $key (keys %g_dataset) {
        if (exists($g_dataset{$key}->{filter_data}) && ($g_dataset{$key}->{filter_data} eq "1")) {
            print "debug: filtercode for $key: " . $g_dataset{$key}->{filter_code} . "\n" if $g_debug;
        } else {
            print "debug: filtercode missing for $key\n" if $g_debug;
            $g_dataset{$key}->{filter_code} = 0;
            $g_dataset{$key}->{filter_malware} = 0;
            $g_dataset{$key}->{filter_block} = 0;
            $g_dataset{$key}->{filter_blacklist} = 0;
            $g_dataset{$key}->{filter_detail} = "";
            $g_dataset{$key}->{filter_weight} = 0;
        }

        # Add a more useful base_domain to strip out those crazy URLs from CDNs.
        my $base_domain = $g_dataset{$key}->{domain};
        $base_domain =~ s/:.*//; # strip out port number

        $_ = $base_domain;

        if (! (/^[0-9\.]*$/)) {  # leave IPs alone
            my @parts = reverse(split(/\.+/, $base_domain));
            $#parts = 2 if ($#parts > 2);
            @parts = reverse(@parts);

            $base_domain = join('.', @parts);
        }

        # Insert into database
        my $query = qq|
            INSERT INTO proxy (timestamp, duration, ip, cache_code, status, bytes, method, request, username, content_type, domain, base_domain,  md5, filter_code, filter_detail, filter_malware, filter_block, filter_blacklist)
            VALUES ( FROM_UNIXTIME(?), ?, INET_ATON(?), ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        |;
    
        my $sth = $g_dbh->prepare($query);
    
        eval {
            $sth->execute(
                $g_dataset{$key}->{timestamp},
                $g_dataset{$key}->{duration},
                $g_dataset{$key}->{ip},
                $g_dataset{$key}->{cache_code},
                $g_dataset{$key}->{status},
                $g_dataset{$key}->{bytes},
                $g_dataset{$key}->{method},
                $g_dataset{$key}->{request},
                $g_dataset{$key}->{username},
                $g_dataset{$key}->{content_type},
                $g_dataset{$key}->{domain},
                $base_domain,
                $g_dataset{$key}->{md5},
                $g_dataset{$key}->{filter_code},
                $g_dataset{$key}->{filter_detail},
                $g_dataset{$key}->{filter_malware},
                $g_dataset{$key}->{filter_block},
                $g_dataset{$key}->{filter_blacklist}
            );
        };

        if ($@) {
            if ($DBI::err eq 1062) {
                # print "duplicate line: $md5\n";
            } elsif ($DBI::err) {
                die "Database error " . $DBI::errstr . "\n";
            }
        } else {
            $g_dbcount++;
        }
    }

    undef(%g_dataset);
    undef(%g_filterset);
    eval { $g_dbh->commit(); };
}

#------------------------------------------------------------------------
# parse_squid_log_line - parses a log line in Squid format
#
# Returns:
# - $retval (0 if parse was successful)
# - $timestamp
# - $duration
# - $ip
# - $cache_code  Squid result codes: http://www.squid-cache.org/Doc/FAQ/FAQ-6.html#ss6.7
# - $status       
# - $bytes
# - $method
# - $request
# - $username
# - $hierarchy
# - $content_type
# - $domain (base domain of the request)
# - $md5
#
# Squid log format:
#
# timestamp      | duration | ip    | cache_code/status            | bytes | method | request                      | username | hierarchy            | content-type
# -------------------------------------------------------------------------------------------------------------------------------------------------------------------
# 1156783571.630   218        127.0.0.1   TCP_MISS/200                   14676   GET      http://www.clarkconnect.com/   bob      DIRECT/69.90.141.108   text/html
#
#------------------------------------------------------------------------

sub parse_squid_log_line {
    my $line = shift or return;
    my $retval;
    my $timestamp;
    my $duration;
    my $ip;
    my $cache_code;
    my $status;
    my $bytes;
    my $method;
    my $request;
    my $username;
    my $hierarchy;
    my $content_type;
    my $domain;
    my $md5;

    if ($line =~ /^(\d{10,}\.\d{3,3})/) {

        my $full_code;
        ($timestamp, $duration, $ip, $full_code, $bytes, $method, $request, $username, $hierarchy, $content_type) = split(/\s+/, $line);
        ($cache_code, $status) = split(/\//, $full_code);

        # If we're using NTLM authentication, we have username in DOMAIN\User format,
        # so split domain and user (authuser) at \

        if ($username =~ /\\/) {
            my ($domain, $user) = split(/\\/,$username);
            $username = $user;
        }

        $request =~ s/\/$//g;
        $domain = $request;
        $domain =~ s/.*:\/\///;
        $domain =~ s/\/.*//;
        $md5 = md5_hex($line);

        $bytes = 0 unless ($bytes =~ /\d+/);
        $retval = 0;
    } else {
        $retval = 1;
    }

    return ($retval, $timestamp, $duration, $ip, $cache_code, $status, $bytes, $method, $request, $username, $hierarchy, $content_type, $domain, $md5);
}

#------------------------------------------------------------------------
# parse_dansguardian_log_line - parses a log line in DansGuardian format
#
# Returns:
# - $retval     - 0 if parse was successful)
# - $timestamp
# - $username
# - $ip
# - $request
# - $code       - see list /etc/dansguardian-av/languages/*/messages
# - $detail
# - $method
# - $bytes
#
# DansGuardian log format:
#
# timestamp           | username | ip        | request                     | code_type + code_text + detail | method | bytes 
# -----------------------------------------------------------------------------------------------------------------------------
# 2005.11.2 11:55:22    bob      192.168.2.212   http://www.clarkconnect.com                                    GET      14482
# 2006.8.28 13:03:38    bob      192.168.2.212   http://www.sex.com/           *DENIED* Banned site: sex.com    GET      0
#
# Version 2.9.9.x 
# - appended more fields
# - included "scanned" in the code without any details, e.g.:
# 2007.9.11 13:26:49 bob 192.168.2.137 http://www.clarkconnect.com/index.php *SCANNED*  GET 13991 -100  1 200 text/html   -
#
# ... weight | category | group | code | mimetype  | (ignore)
# -----------------------------------------------------------------------------------------------------------------------------
# ... -80                  1       200   text/html   
# ... -80      ?           1       200   text/html 
#
# The "code_type + code_text + detail" is not consistent which makes parsing fragile
# 1) the information is logged in localized format.
# 2) a standard field separator is not used.
# 3) code_type is optional
#
# To get around this issue, the get_filter_messages function loads
# ALL the local tags into a hash array and uses it to determine the code.
#------------------------------------------------------------------------

sub parse_dansguardian_log_line {
    my $line = shift or return;
    my $retval;
    my $timestamp;
    my $username;
    my $ip;
    my $request;
    my $code;
    my $detail;
    my $code_and_detail;
    my $method;
    my $bytes;
    my $weight;
    my $date;
    my $time;
    my $toeol;

    if ($line =~ /^(\d{4,}\.\d{1,2}\.\d{1,2})/) {
        $line =~ s/\*SCANNED\* //;  # Gets in the way

        ($date, $time, $username, $ip, $request, $toeol) = split(/ /, $line, 6);

        $date = "$date $time";
        $timestamp = parsedate($date);
        $code = 0;

        # Empty code information
        if ($toeol =~ /^ /) {
            $detail = "";
            $toeol =~ s/^ //;

        # Populated code field
        } elsif ($toeol =~ / (GET|CONNECT) /) {
            $detail = "unknown";

            $toeol =~ /(\*.*\*) (.*) (GET|CONNECT) .*/;
            $code_and_detail = $2;

            if ($code_and_detail) {
                foreach my $code_lookup (keys %g_filter_messages) {
                    my $regex = $code_lookup;
                    $regex =~ s/\[/\\[/g;
                    if ($code_and_detail =~ /$regex/) {
                        $code = $g_filter_messages{$code_lookup};
                        $detail = $code_and_detail;
                        $detail =~ s/$regex//;
                        $detail =~ s/^\s*//;
                    }
                }
            }

            $toeol =~ s/.* GET /GET /;
            $toeol =~ s/.* CONNECT /CONNECT /;
        }

        ($method, $bytes, $weight, $toeol) = split(/ /, $toeol, 4);

        $bytes = 0 unless ($bytes =~ /\d+/);
        $request =~ s/\/$//g;
        $request =~ s/\?.*/\?/;

        $retval = 0;
    } else {
        $retval = 1;
    }

    return ($retval, $timestamp, $date, $username, $ip, $request, $code, $detail, $method, $bytes, $weight);
}

#------------------------------------------------------------------------
# db_connect - initializes and connects to database
#------------------------------------------------------------------------

sub db_connect() {
    my $dbuser = "reports";
    my $dbname = "reports";
    my $dbport = "3308";
    my $dbhost = "127.0.0.1";
    my $dbpass;

    if (-e "/var/clearos/system_database/reports") {
        open (DBINFO, "/var/clearos/system_database/reports") or die $!;
        while(<DBINFO>) {
            if (/^password\s*=\s*/) {
                s/^password\s*=\s*//;
                chomp();
                $dbpass = $_;
            }
        }
    }

    system("/usr/sbin/initialize-report-tables proxy_report >/dev/null 2>&1");

    my $retry = 0;

    while ($retry < 3) {
        eval {
            $g_dbh = DBI->connect(
                "DBI:mysql:$dbname:$dbhost:$dbport",
                $dbuser,
                $dbpass,
                { RaiseError => 1, PrintError => 0, AutoCommit => 0 }
            );
        };

        if ($@) {
            if ($DBI::err eq 2003) {
                system("/sbin/service system-mysqld start >/dev/null 2>&1");
            } else {
                system("/usr/sbin/initialize-report-tables proxy_report >/dev/null 2>&1");
            }

            $retry++;
            sleep(3);
        } else {
            return;
        }
    }

    die "Failed to connect to database";
}

#------------------------------------------------------------------------
# db_disconnect - disconnects from database
#------------------------------------------------------------------------

sub db_disconnect() {
    if ($g_dbh) {
        $g_dbh->commit();
        $g_dbh->disconnect;
    }
}

#------------------------------------------------------------------------
# is_ipv4 - validates IP address
#------------------------------------------------------------------------

sub is_ipv4 {
    my $self = shift if ref($_[0]);
    my $value = shift;

    return unless defined($value);

    my(@octets) = $value =~ /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/;
    return unless (@octets == 4);

    foreach (@octets) {
        return unless ($_ >= 0 && $_ <= 255);
    }

    return join('.', @octets);
}

#------------------------------------------------------------------------
# usage - usage output
#------------------------------------------------------------------------

sub usage() {
    print STDERR qq|
This program loads squid and dansguardian files into a mysql database.

    usage: $0 [-hvfa]

    -h        : this (help) message
    -v        : verbose output
    -f        : force insert
    -a        : all logs

    example: $0 -v -a

|;
    exit;
}
