[Libreoffice-commits] dev-tools.git: update-stats-geoip/analyser.py update-stats-geoip/compiler.py update-stats-geoip/.gitignore update-stats-geoip/README
Andrzej Hunt
andrzej.hunt at collabora.com
Fri Jan 24 11:39:46 PST 2014
update-stats-geoip/.gitignore | 3
update-stats-geoip/README | 25 +++++
update-stats-geoip/analyser.py | 50 ++++++++++
update-stats-geoip/compiler.py | 203 +++++++++++++++++++++++++++++++++++++++++
4 files changed, 281 insertions(+)
New commits:
commit b81dbf61684484263690043909334f17f85709c5
Author: Andrzej Hunt <andrzej.hunt at collabora.com>
Date: Thu Jan 23 15:43:30 2014 +0000
Add update-stats-geoip for tracking of updates pings per country.
Change-Id: Id0740a4ae484c96ad7456c1747c24ac5591966fa
Reviewed-on: https://gerrit.libreoffice.org/7613
Reviewed-by: Michael Meeks <michael.meeks at collabora.com>
Tested-by: Michael Meeks <michael.meeks at collabora.com>
diff --git a/update-stats-geoip/.gitignore b/update-stats-geoip/.gitignore
new file mode 100644
index 0000000..70ca242
--- /dev/null
+++ b/update-stats-geoip/.gitignore
@@ -0,0 +1,3 @@
+data
+GeoIP.dat
+storage-country-months
diff --git a/update-stats-geoip/README b/update-stats-geoip/README
new file mode 100644
index 0000000..163893f
--- /dev/null
+++ b/update-stats-geoip/README
@@ -0,0 +1,25 @@
+Produces a csv list of pings per country from the Update pings.
+
+
+1. Ensure you have the GeoLite Country Database, "GeoIP.dat", and place
+ it in the script folder:
+
+ http://dev.maxmind.com/geoip/legacy/geolite/
+
+2. Place the stats data in data/ (relative to the script folder)
+
+3. Run compiler.py -- this can take a few hours, and places its temporary data
+   in storage-country-months/
+
+4. Run analyser.py (this is fast)
+
+If you have updated the ping data in data/, re-running compiler.py should take
+much less time as it reuses the previously parsed data (although this isn't
+fully tested yet).
+
+To count unique IPs per month rather than update pings, for a quick hack solution
+it would probably be sufficient to replace the hit-counting line in compiler.py:
+ipHits[sVersion][sIP] = ipHits.get(sVersion, {}).get(sIP,0) + 1
+with
+ipHits[sVersion][sIP] = 1
+
diff --git a/update-stats-geoip/analyser.py b/update-stats-geoip/analyser.py
new file mode 100755
index 0000000..14ca245
--- /dev/null
+++ b/update-stats-geoip/analyser.py
@@ -0,0 +1,50 @@
+#!/usr/bin/python
+
+# Introduces Python 3 style print
+from __future__ import print_function
+
+import argparse
+import collections
+import datetime
+import cPickle as pickle
+import os
+
+VERSIONLIST = set({"3.3", "3.4", "3.5", "3.6", "4.0", "4.1", "4.2"})
+
+parser = argparse.ArgumentParser(description='Process LO Update Ping data')
+#parser.addArgument('versions', metavar='V', type=int, nargs='+',
+ #help='The LO versions you would like to analyse (3.5, 3.6, 4.0, 4.1.)')
+
+args = parser.parse_args()
+
+
+sPrefix = "country-months"
+
+#sSep = "\t" # Easier on the eyes
+sSep = ";" # Easier on the software
+
+# Year/Month to start with.
+year = 2011
+month = 1
+
+sHeaderLine="Country" + sSep + "Version" + sSep + "YearMonth" + sSep + "hits"
+
+print(sHeaderLine)
+
+while (datetime.date(year, month, 1) + datetime.timedelta(days=20)) < datetime.date.today():
+ month += 1
+ if (month > 12):
+ year += 1
+ month = 1
+ aData = collections.defaultdict(dict)
+ for version in VERSIONLIST:
+ sFile = "storage-" + sPrefix + "/" + version + "/" + str(year) + "/" + str(month).zfill(2) + "/countryhits.dat"
+ if not os.path.exists(sFile):
+ continue;
+ with open(sFile, 'r') as aFile:
+ aData[version] = pickle.load(aFile)
+
+ for sVersion in VERSIONLIST:
+ for sCountry,nHits in aData.get(sVersion, {}).iteritems():
+ if (nHits > 300) and len(sCountry) > 0:
+ print(sCountry + sSep + sVersion + sSep + str(year)+str(month).zfill(2) + sSep + str(nHits))
diff --git a/update-stats-geoip/compiler.py b/update-stats-geoip/compiler.py
new file mode 100755
index 0000000..7192b3e
--- /dev/null
+++ b/update-stats-geoip/compiler.py
@@ -0,0 +1,203 @@
+#!/usr/bin/python
+#
+# You will need to get the GeoLite Country database ("GeoIP.dat") from:
+# http://dev.maxmind.com/geoip/legacy/geolite/
+#
+# No proper support for ipV6 addresses yet (i.e. geolocation/ISP data) -- they
+# are in a separate db, and only available in newer version of python-geoip (i.e.
+# would have to be built separately) so possibly not worth the bother yet?
+
+# Introduces Python 3 style print
+from __future__ import print_function
+
+import bz2
+import collections
+import ConfigParser
+import datetime
+import GeoIP
+import cPickle as pickle
+import re
+import sys
+import time
+import os.path
+
+#sResolution = "%W" # Split by week
+sResolution = "%m" # Split by month
+
+sPrefix = "country-months"
+
+
+sGEOIPFile = "GeoIP.dat"
+
+gi = GeoIP.open(sGEOIPFile,GeoIP.GEOIP_STANDARD)
+
+def getCountryForIP(sIP):
+ return str(gi.country_name_by_addr(sIP))
+
+dataLessDates = {
+ datetime.date(2013,1,3),
+ datetime.date(2013,2,28),
+ datetime.date(2013,3,6),
+ datetime.date(2013,3,7),
+ datetime.date(2013,4,11),
+ datetime.date(2013,4,12),
+ datetime.date(2013,4,13),
+ datetime.date(2013,4,14),
+ datetime.date(2013,8,28),
+ datetime.date(2013,8,29),
+ datetime.date(2013,8,30),
+ datetime.date(2013,8,31),
+ datetime.date(2013,9,1),datetime.date(2013,9,2),datetime.date(2013,9,3),datetime.date(2013,9,4),datetime.date(2013,9,5),datetime.date(2013,9,6),datetime.date(2013,9,7),
+ datetime.date(2013,9,8),datetime.date(2013,9,9),datetime.date(2013,9,10)
+ }
+
+VERSIONLIST = set({"3.3", "3.4", "3.5", "3.6", "4.0", "4.1", "4.2"})
+
+linePattern = re.compile('^([^ ]+) - - \[([^\/]+)\/([^\/]+)\/([^:]+):([0-9][0-9])[^\]]*\] "GET [^"]*" [^ ]+ [^ ]+ "[^"]*" "[^ ]* ([0-9]\.[0-9])[^(]*\(([^-;]+)[^;]*; ([^;]*);')
+
+print("*Analysing IPs...")
+
+config = ConfigParser.RawConfigParser()
+config.read('storage-' + sPrefix + '/compiler.cfg')
+
+
+if config.has_option('Main', 'last_year'):
+ currentFileDate = datetime.date(int(config.get('Main', 'last_year')), int(config.get('Main', 'last_month')), int(config.get('Main', 'last_day')))
+else:
+ currentFileDate = datetime.date(2012,04,27)
+
+def getCurrentFileName():
+ return "data/" + "update.libreoffice.org-access_log-" + currentFileDate.strftime("%Y%m%d") + ".bz2"
+
+sKnownIPsLocation = "storage-" + sPrefix + "/knownIPs.dat"
+
+knownIPs = set()
+if os.path.isfile(sKnownIPsLocation):
+ f = open(sKnownIPsLocation, 'r')
+ knownIPs = set(f.readlines())
+
+ipHits = collections.defaultdict(dict)
+currentWeek = ""
+
+newIPsOverall = set() # We keep a track of new IPs overall
+newIPs = collections.defaultdict(set) # But also new IPs associated with what version they downloaded
+
+lastDate = datetime.date(1980,1,1)
+
+while os.path.isfile(getCurrentFileName()) or currentFileDate in dataLessDates:
+ print(getCurrentFileName())
+
+ if (currentFileDate in dataLessDates):
+ currentFileDate += datetime.timedelta(days=1)
+ continue
+
+ with bz2.BZ2File(getCurrentFileName(), 'r') as aFile:
+ for line in aFile:
+ m = linePattern.split(line)
+ if len(m) > 1:
+ sIP = m[1]
+ sDay = m[2]
+ sMonth = m[3]
+ sYear = m[4]
+
+ currentDate = datetime.date(int(sYear), time.strptime(sMonth,'%b').tm_mon, int(sDay))
+
+ # Store the week, reinitialise counts -- should be refactored out
+ if currentWeek != currentDate.strftime(sResolution):
+ # We need the week before, i.e the week we just parsed
+ # The exact date we hit doesn't matter, but we need to shift
+ # an entire week since e.g. 6 days of data could be missing
+ # and currentDate *could* be the last day of the week.
+ storageDate = lastDate
+ if currentWeek != "":
+ for version in VERSIONLIST:
+ sDirectory = "storage-" + sPrefix + "/" + version + storageDate.strftime("/%Y/" + sResolution)
+ if not os.path.exists(sDirectory):
+ os.makedirs(sDirectory)
+ pickle.dump( ipHits.get(version, {}), open(sDirectory + "/iphits.dat", 'w' ))
+
+ # Create country mapping here:
+ for sVersion in VERSIONLIST:
+ countryHits = {} # Reuse for every version to save memory
+ for sIP,nHits in ipHits.get(sVersion, {}).iteritems():
+ sCountry = getCountryForIP(sIP)
+ if len(sCountry) > 0:
+ countryHits[sCountry] = countryHits.get(sCountry,0) + nHits
+
+ sDirectory = "storage-" + sPrefix + "/" + sVersion + storageDate.strftime("/%Y/" + sResolution)
+ if not os.path.exists(sDirectory):
+ os.makedirs(sDirectory)
+ pickle.dump( countryHits, open(sDirectory + "/countryhits.dat", 'w' ))
+ print("storing "+ sDirectory + "/countryhits.dat")
+
+ # Deal with new IPSortedByHits
+ # per version
+ for sVersion in VERSIONLIST:
+ sDirectory = "storage-" + sPrefix + "/" + version + storageDate.strftime("/%Y/" + sResolution)
+ file = open(sDirectory + "/newips.dat", 'w')
+ file.writelines( "%s\n" % item for item in newIPs[sVersion] )
+ # And total
+ sDirectory = "storage-" + sPrefix + "/" + "overall" + storageDate.strftime("/%Y/" + sResolution)
+ if not os.path.exists(sDirectory):
+ os.makedirs(sDirectory)
+ file = open(sDirectory + "/newips.dat", 'w')
+ file.writelines( "%s\n" % item for item in newIPs )
+ knownIPs.add(sIP)
+
+ # Cleanup
+ currentWeek = currentDate.strftime(sResolution)
+ print("Now on week " + currentDate.strftime(sResolution + " of %Y"))
+ ipHits = collections.defaultdict(dict)
+ newIPsOverall = set() # We keep a track of new IPs overall
+ newIPs = collections.defaultdict(set) # But also new IPs associated with what version they downloaded
+
+ # And read in existing data for this week
+ for version in VERSIONLIST:
+ sFile = "storage-" + sPrefix + "/" + version + currentDate.strftime("/%Y/" + sResolution) + "/iphits.dat"
+ if os.path.exists(sFile):
+ print("*********************************************************************************")
+ print("WARNING: data mis-ordered, we are reloading the following file, could be very wrong")
+ print(sFile)
+ #raw_input("Press Enter to continue: ")
+ ipHits[version] = pickle.load(open(sFile))
+ #pickle.dump( ipHits.get(sVersion, {}), open(sDirectory + "/iphits.dat", 'w' ))
+ print("reading "+ sFile)
+ # Countries are reprocessed on every write, so we can ignore them
+ # Unique IPs will be a mess if data isn't ordered, so ignore for now...
+ # TODO: deal with ^^^
+
+ lastDate = currentDate
+
+ #sHour = m[5]
+ #sOS = m[7] #Unused
+ sVersion = m[6] # Hash of version...
+
+ if sVersion in VERSIONLIST: # Some people hit the url with a browser, so various UAs here....
+ ipHits[sVersion][sIP] = ipHits.get(sVersion, {}).get(sIP,0) + 1
+
+ if sIP not in knownIPs:
+ newIPsOverall.add(sIP)
+ newIPs[sVersion].add(sIP)
+ # We don't add to the knownIPs list yet, as the data
+ # could get lost as we only write completed weeks
+ # of data, hence we update knownIPS in the week storage
+ # mechanism above
+ else:
+ print("Unknown version: " + line)
+
+ currentFileDate += datetime.timedelta(days=1)
+
+
+if not config.has_section('Main'):
+ config.add_section('Main')
+
+config.set('Main', 'last_year', currentFileDate.strftime("%Y"))
+config.set('Main', 'last_month', currentFileDate.strftime("%m"))
+config.set('Main', 'last_day', currentFileDate.strftime("%d"))
+
+config.write(open("storage-" + sPrefix + "/compiler.cfg", 'w'))
+
+file = open(sKnownIPsLocation, 'w')
+file.writelines( "%s\n" % item for item in knownIPs )
+
+print("*Completed successfully")
\ No newline at end of file
More information about the Libreoffice-commits
mailing list