#!/usr/bin/env python
"""
Blogalizer - Analyse an HTTP log in order to estimate the number
of subscribers to a blog / feed.

Usage:
python blogelizer.py log-file [feed-resource]

- log-file is the location of a standard HTTP log file
- feed-resource is an optional regular expression that matches
  a resource (for example /blog/feed/rss) if it is the
  feed we're analysing.
"""

import sys
import re
import time

# Regular expression for parsing one line (of a GET request) in the HTTP log.
# Groups: client IPv4 address, bracketed timestamp, requested resource and
# the last double-quoted field on the line (used as the user agent).
# Raw strings keep the regex escapes (\. \[ \]) away from Python's own
# string-escape processing.
re_req = re.compile(r'([0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}).+?\[(.+?)\].+?GET (/.+?) HTTP.+"([^"]+?)"')

# Regular expression for identifying requests from public aggregators.
# Groups: aggregator name, reported count, and the word that follows it.
# The count is matched with [0-9]+ rather than a fixed maximum width so that
# large subscriber counts are not truncated to their last four digits.
re_public = re.compile(r'(.+?) \(.+?([0-9]+) ([Ss]ubscriber|[Rr]eader).+')

# Regular expression for identifying a request
# to the blog's feed
if len(sys.argv) > 2:
    # Supplied as the second argument to the script
    re_feed = re.compile(sys.argv[2])
else:
    # Defaults to the setup of the LShift wordpress blog
    re_feed = re.compile(r'/blog(/feed.*|.+?feed=.+)')

def parse_timestamp(timestamp_str):
    """Convert an HTTP log timestamp to seconds since the epoch.

    timestamp_str has the form 'DD/Mon/YYYY:HH:MM:SS +ZZZZ', for example
    '10/Oct/2000:13:55:36 -0700'. Everything after the first space in the
    seconds field (the UTC offset) is ignored; the date and time are
    interpreted as local time of the machine running the script.

    Returns the float produced by time.mktime.

    Raises ValueError if the string does not split into the expected
    fields, the month abbreviation is unknown, or a field is not numeric.
    """
    month_names = ('Jan', 'Feb', 'Mar',
                   'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep',
                   'Oct', 'Nov', 'Dec')
    date, hour, minutes, seconds_tz = timestamp_str.split(':')
    seconds = seconds_tz.split(' ')[0]
    day, month_name, year = date.split('/')
    month = month_names.index(month_name) + 1
    # isdst=-1 lets mktime work out whether daylight saving time applies.
    # Forcing 0 (standard time) shifts every summer timestamp by one hour
    # once it is converted back with time.localtime.
    return time.mktime((int(year), month, int(day),
                        int(hour), int(minutes), int(seconds),
                        0, 0, -1))

def record(line):
    """Parse a line from the HTTP log.

    Returns a dictionary with the keys 'address', 'timestamp',
    'resource' and 'useragent' when the line is a GET request whose
    resource matches re_feed. Returns None for every other line,
    including lines that re_req cannot parse.
    """
    match = re_req.match(line)
    if not match:
        return None
    req_address, req_timestamp_str, req_resource, req_useragent = match.groups()
    # Test the resource before parsing the timestamp: requests that are
    # not for the feed are discarded anyway, so there is no reason to
    # spend time on (or fail over) their timestamps.
    if not re_feed.match(req_resource):
        return None
    return {'address': req_address,
            'timestamp': parse_timestamp(req_timestamp_str),
            'resource': req_resource,
            'useragent': req_useragent}
    

# Collect all relevant data from the HTTP log.
# The file to parse is the first argument to the script.
if len(sys.argv) < 2:
    # No log file given: show the usage text instead of a traceback.
    sys.exit(__doc__)

# Iterate over the file object so that the handle is closed promptly and
# the final line is kept even when the file has no trailing newline.
with open(sys.argv[1], 'r') as log_file:
    records = [rec
               for rec
               in (record(line) for line in log_file)
               if rec]

if not records:
    sys.exit('No feed requests found in %s' % sys.argv[1])

# Order the requests chronologically; the newest one defines the end of
# the analysed period.
records.sort(key=lambda rec: rec['timestamp'])
last_timestamp = records[-1]['timestamp']
one_day = 24 * 60 * 60
first_timestamp = int(last_timestamp) - one_day

# Keep only the requests made during the last 24 hours of the log.
# (No upper-bound test is needed: after sorting, nothing is newer than
# last_timestamp.)
records = [rec
           for rec
           in records
           if rec['timestamp'] > first_timestamp]

# Request counts of private aggregators,
# keyed by the IP address the requests
# came from
private_addresses = dict()

# Subscriber counts reported by public
# aggregators: each aggregator identifier
# maps to a dictionary that maps a
# resource to the number of subscribers
# reported for that resource
public_agents = dict()

def is_public(record):
    """Tell whether a request was made by a public aggregator.

    Applies re_public to the request's 'useragent' field and returns
    the resulting match object (truthy), or None if the user agent
    does not match.
    """
    useragent = record['useragent']
    return re_public.match(useragent)

# Sort all requests to private and public.
# For the public ones, record the reported
# number of subscribers for each resource.
# For private ones, record the number of
# requests made
for rec in records:
    # Match once and reuse the result for both the public/private
    # decision and the field extraction.
    public_match = re_public.match(rec['useragent'])
    if public_match:
        name, subscribers, _ = public_match.groups()
        # A later request overwrites the count reported earlier for the
        # same aggregator and resource.
        public_agents.setdefault(name, {})[rec['resource']] = int(subscribers)
    else:
        address = rec['address']
        private_addresses[address] = private_addresses.get(address, 0) + 1

# The estimated number of private subscribers
# is the number of unique IP addresses that made
# more than 2 requests within the analysed records
private_subscribers = len([request_count
                           for request_count
                           in private_addresses.values()
                           if request_count > 2])

# strftime pattern used when printing timestamps in the report
timestamp_format = '%Y-%m-%d %H:%M:%S'

def format_timestamp(timestamp):
    """Render seconds since the epoch as a local-time string
    laid out according to timestamp_format.
    """
    local_time = time.localtime(timestamp)
    return time.strftime(timestamp_format, local_time)

# Report the period covered by the analysed records
print('')
print('From %s To %s' % (format_timestamp(records[0]['timestamp']),
                         format_timestamp(last_timestamp)))
print('')

print('Private - %d' % private_subscribers)

# Running total, starting with the private subscribers
all_subscribers = private_subscribers

# Each public aggregator contributes the sum
# of the subscriber counts it reported over
# all the resources it requested
for name, reported in public_agents.items():
    agent_subscribers = sum(reported.values())
    all_subscribers += agent_subscribers
    print('%s - %d' % (name, agent_subscribers))

print('')
print('Total: ~ %d Subscribers' % all_subscribers)
