#!/usr/bin/env python2.7
"""Rebuild per-page ``hitcounts`` cache files from a wiki ``event-log``.

``<datadir>/event-log`` is read as tab-separated rows of *timestamp*,
*event type* and ``&``-separated ``key=value`` parameters.  VIEWPAGE and
SAVEPAGE events are tallied per page and per local calendar day, and the
result is pickled to ``<datadir>/pages/<page>/cache/hitcounts`` for every
page whose directory already exists.

Usage: ``SCRIPT [-w] MOIN_DATA_DIR``.  Without ``-w`` nothing is written and
a one-line summary per page is printed instead.

Written for Python 2.7; the small shims below also let it run on Python 3.
"""

from collections import defaultdict
from csv import reader
from datetime import datetime
from errno import EEXIST
from os import makedirs
from os.path import basename, exists, isdir, join
from pickle import dumps, HIGHEST_PROTOCOL
from sys import argv, version_info

try:
    from urllib import unquote_plus  # Python 2
except ImportError:
    from urllib.parse import unquote_plus  # Python 3

_PY2 = version_info[0] == 2

# On Python 2 this equals HIGHEST_PROTOCOL (output unchanged); the cap keeps
# files written under Python 3 loadable by a Python 2 unpickler.
_CACHE_PROTOCOL = min(HIGHEST_PROTOCOL, 2)

_COUNTED_EVENTS = ("VIEWPAGE", "SAVEPAGE")


def _open_event_log(path):
    """Open the event log in the mode the csv module expects on this Python."""
    if _PY2:
        return open(path, "rb")
    return open(path, "r", newline="", encoding="utf-8", errors="replace")


def _parse_params(values):
    """Turn ``a=1&b=2`` into a dict, ignoring fragments that lack ``=``."""
    params = {}
    for pair in values.split("&"):
        key, sep, value = pair.partition("=")
        if sep:
            params[key] = value
    return params


def _flush_day(totals, day, views, saves):
    """Append ``(day, views, saves)`` to the history of every known page."""
    for pagename, history in totals.items():
        history.append((day, views.get(pagename, 0), saves.get(pagename, 0)))


def _is_page_dirname(name):
    """Return True if *name* can only denote a direct child of ``pages/``."""
    return name not in ("", ".", "..") and basename(name) == name


def _ensure_dir(path):
    """Create *path* (and parents); an already existing directory is fine."""
    try:
        makedirs(path)
    except OSError as err:
        if err.errno != EEXIST or not isdir(path):
            raise


def main(datadir, write_caches=True):
    """Aggregate the event log in *datadir* and write or preview hit caches.

    :param datadir: directory containing ``event-log`` and ``pages/``.
    :param write_caches: when true, write
        ``pages/<page>/cache/hitcounts``; when false (dry run), touch nothing
        on disk and print a summary line per page instead.
    :raises IOError/OSError: if the event log cannot be read or a cache
        cannot be written.
    :raises ValueError: if a counted row has a non-integer timestamp.

    Each cache file holds a pickled tuple
    ``(latest, [dates], [views], [saves])`` with one list entry per day on
    which the log contained a counted event, starting from the first day the
    page itself appears in the log.
    """
    totals = {}  # pagename -> [(date, views, saves), ...]
    views = defaultdict(int)
    saves = defaultdict(int)
    # NOTE(review): the original update of `latest` from the row timestamp
    # appears to be commented out, so the stored timestamp is always 0.
    # Kept as-is; confirm whether that is intended.
    latest = 0
    active_date = None

    with _open_event_log(join(datadir, "event-log")) as f:
        for row in reader(f, delimiter="\t", lineterminator="\n"):
            if len(row) != 3:
                # blank or malformed line: nothing to count
                continue
            tstamp, eventtype, values = row
            if eventtype not in _COUNTED_EVENTS:
                continue

            # timestamps are divided by 1e6 to get seconds for fromtimestamp()
            seconds = int(tstamp) / 1000000.0
            row_date = datetime.fromtimestamp(seconds).strftime("%Y-%m-%d")
            if active_date != row_date:
                # day changed: store the finished day and start fresh counters
                _flush_day(totals, active_date, views, saves)
                active_date = row_date
                views = defaultdict(int)
                saves = defaultdict(int)

            pagename = unquote_plus(_parse_params(values).get("pagename", ""))
            if not pagename:
                # an event without a page must not create a "" entry, which
                # would resolve to the pages/ directory itself
                continue

            # start counting this page from this date
            totals.setdefault(pagename, [])
            if eventtype == "VIEWPAGE":
                views[pagename] += 1
            else:  # SAVEPAGE
                saves[pagename] += 1

    # process last date
    _flush_day(totals, active_date, views, saves)

    for page, data in totals.items():
        # A name containing a path separator could point at some other
        # existing directory (e.g. a page's sub-directory); never write there.
        if not _is_page_dirname(page):
            continue
        # only produce cache for existing pages
        page_path = join(datadir, "pages", page)
        if not exists(page_path):
            continue

        cache_path = join(page_path, "cache")
        cache_file = join(cache_path, "hitcounts")
        # transpose [(date, views, saves), ...] into three parallel lists
        records = [latest] + [list(column) for column in zip(*data)]

        if write_caches:
            _ensure_dir(cache_path)
            with open(cache_file, "wb") as cache:
                cache.write(dumps(tuple(records), _CACHE_PROTOCOL))
        else:
            # dry run: report only, leave the file system untouched
            print(
                "%s: update ts(%f) latest date(%s) latest views(%d) latest saves(%d)"
                % (cache_file, records[0], records[1][-1], records[2][-1],
                   records[3][-1])
            )


def _cli(args):
    """Parse ``[-w] MOIN_DATA_DIR`` (flag accepted in any position) and run."""
    positional = [arg for arg in args[1:] if arg != "-w"]
    if not positional:
        raise SystemExit("usage: %s [-w] MOIN_DATA_DIR" % args[0])
    # dry run unless -w specified
    main(positional[0], "-w" in args)


if __name__ == "__main__":
    _cli(argv)