[HarfBuzz] harfbuzz-ng: Branch 'master' - 5 commits

Behdad Esfahbod behdad at kemper.freedesktop.org
Wed May 9 01:12:41 PDT 2012


 test/shaping/Makefile.am      |    2 
 test/shaping/hb-diff-ngrams   |    5 +
 test/shaping/hb-diff-stat     |    5 +
 test/shaping/hb_test_tools.py |  174 +++++++++++++++++++++++++++++++++++++++---
 4 files changed, 177 insertions(+), 9 deletions(-)

New commits:
commit 2214a03900d32710573a1b05c7665195b3129761
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Wed May 9 09:54:54 2012 +0200

    Add hb-diff-ngrams

diff --git a/test/shaping/Makefile.am b/test/shaping/Makefile.am
index 81c9991..4fb762c 100644
--- a/test/shaping/Makefile.am
+++ b/test/shaping/Makefile.am
@@ -13,6 +13,7 @@ EXTRA_DIST += \
 	hb-diff \
 	hb-diff-colorize \
 	hb-diff-filter-failures \
+	hb-diff-ngrams \
 	hb-diff-stat \
 	hb-manifest-read \
 	hb-manifest-update \
diff --git a/test/shaping/hb-diff-ngrams b/test/shaping/hb-diff-ngrams
new file mode 100755
index 0000000..a496447
--- /dev/null
+++ b/test/shaping/hb-diff-ngrams
@@ -0,0 +1,5 @@
+#!/usr/bin/python
+
+from hb_test_tools import *
+
+UtilMains.process_multiple_files (DiffSinks.print_ngrams)
diff --git a/test/shaping/hb_test_tools.py b/test/shaping/hb_test_tools.py
index a38f067..3ff75b8 100644
--- a/test/shaping/hb_test_tools.py
+++ b/test/shaping/hb_test_tools.py
@@ -155,12 +155,60 @@ class DiffFilters:
 			if not DiffHelpers.test_passed (lines):
 				for l in lines: yield l
 
+class Stat:
+
+	def __init__ (self):
+		self.count = 0
+		self.freq = 0
+
+	def add (self, test):
+		self.count += 1
+		self.freq += test.freq
+
+class Stats:
+
+	def __init__ (self):
+		self.passed = Stat ()
+		self.failed = Stat ()
+		self.total  = Stat ()
+
+	def add (self, test):
+		self.total.add (test)
+		if test.passed:
+			self.passed.add (test)
+		else:
+			self.failed.add (test)
+
+	def mean (self):
+		return float (self.passed.count) / self.total.count
+
+	def variance (self):
+		return (float (self.passed.count) / self.total.count) * \
+		       (float (self.failed.count) / self.total.count)
+
+	def stddev (self):
+		return self.variance () ** .5
+
+	def zscore (self, population):
+		"""Calculate the standard score.
+		   Population is the Stats for population.
+		   Self is Stats for sample.
+		   Returns larger absolute value if sample is highly unlikely to be random.
+		   Anything outside of -3..+3 is very unlikely to be random.
+		   See: http://en.wikipedia.org/wiki/Standard_score"""
+
+		return (self.mean () - population.mean ()) / population.stddev ()
+
+
+
+
 class DiffSinks:
 
 	@staticmethod
 	def print_stat (f):
 		passed = 0
 		failed = 0
+		# XXX port to Stats, but that would really slow us down here
 		for key, lines in DiffHelpers.separate_test_cases (f):
 			if DiffHelpers.test_passed (lines):
 				passed += 1
@@ -172,21 +220,34 @@ class DiffSinks:
 	@staticmethod
 	def print_ngrams (f, ns=(1,2,3)):
 		gens = tuple (Ngram.generator (n) for n in ns)
+		allstats = Stats ()
+		allgrams = {}
 		for key, lines in DiffHelpers.separate_test_cases (f):
 			test = Test (lines)
-			unicodes = test.unicodes
-			del test
+			allstats.add (test)
 
 			for gen in gens:
-				print "Printing %d-grams:" % gen.n
-				for ngram in gen (unicodes):
-					print ngram
+				for ngram in gen (test.unicodes):
+					if ngram not in allgrams:
+						allgrams[ngram] = Stats ()
+					allgrams[ngram].add (test)
+
+		importantgrams = {}
+		for ngram, stats in allgrams.iteritems ():
+			if stats.failed.count >= 30: # for statistical reasons
+				importantgrams[ngram] = stats
+		allgrams = importantgrams
+		del importantgrams
+
+		for ngram, stats in allgrams.iteritems ():
+			print "zscore: %9f failed: %6d passed: %6d ngram: <%s>" % (stats.zscore (allstats), stats.failed.count, stats.passed.count, ','.join ("U+%04X" % u for u in ngram))
 
 
 
 class Test:
 
 	def __init__ (self, lines):
+		self.freq = 1
 		self.passed = True
 		self.identifier = None
 		self.text = None
commit 178e6dce01ad28c8708bad62ce0fb79c46e836dc
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Wed May 9 08:57:29 2012 +0200

    Add N-gram generator

diff --git a/test/shaping/hb_test_tools.py b/test/shaping/hb_test_tools.py
index d3c0939..a38f067 100644
--- a/test/shaping/hb_test_tools.py
+++ b/test/shaping/hb_test_tools.py
@@ -169,6 +169,53 @@ class DiffSinks:
 		total = passed + failed
 		print "%d out of %d tests passed.  %d failed (%g%%)" % (passed, total, failed, 100. * failed / total)
 
+	@staticmethod
+	def print_ngrams (f, ns=(1,2,3)):
+		gens = tuple (Ngram.generator (n) for n in ns)
+		for key, lines in DiffHelpers.separate_test_cases (f):
+			test = Test (lines)
+			unicodes = test.unicodes
+			del test
+
+			for gen in gens:
+				print "Printing %d-grams:" % gen.n
+				for ngram in gen (unicodes):
+					print ngram
+
+
+
+class Test:
+
+	def __init__ (self, lines):
+		self.passed = True
+		self.identifier = None
+		self.text = None
+		self.unicodes = None
+		self.glyphs = None
+		for l in lines:
+			symbol = l[0]
+			if symbol != ' ':
+				self.passed = False
+			i = 1
+			if ':' in l:
+				i = l.index (':')
+				if not self.identifier:
+					self.identifier = l[1:i]
+				i = i + 2 # Skip colon and space
+			j = -1
+			if l[j] == '\n':
+				j -= 1
+			brackets = l[i] + l[j]
+			l = l[i+1:-2]
+			if brackets == '()':
+				self.text = l
+			elif brackets == '<>':
+				self.unicodes = Unicode.parse (l)
+			elif brackets == '[]':
+				# XXX we don't handle failed tests here
+				self.glyphs = l
+
+
 class DiffHelpers:
 
 	@staticmethod
@@ -205,6 +252,23 @@ class FilterHelpers:
 		return printer
 
 
+class Ngram:
+
+	@staticmethod
+	def generator (n):
+
+		def gen (f):
+			l = []
+			for x in f:
+				l.append (x)
+				if len (l) == n:
+					yield tuple (l)
+					l[:1] = []
+
+		gen.n = n
+		return gen
+
+
 class UtilMains:
 
 	@staticmethod
@@ -276,10 +340,14 @@ class Unicode:
 		return '<' + u','.join ("U+%04X" % ord (u) for u in unicode (s, 'utf-8')).encode ('utf-8') + '>'
 
 	@staticmethod
-	def encode (s):
+	def parse (s):
 		s = re.sub (r"[<+>,\\uU\n	]", " ", s)
 		s = re.sub (r"0[xX]", " ", s)
-		return u''.join (unichr (int (x, 16)) for x in s.split (' ') if len (x)).encode ('utf-8')
+		return [int (x, 16) for x in s.split (' ') if len (x)]
+
+	@staticmethod
+	def encode (s):
+		return u''.join (unichr (x) for x in Unicode.parse (s)).encode ('utf-8')
 
 	shorthands = {
 		"ZERO WIDTH NON-JOINER": "ZWNJ",
commit 98669ceb77657d60435f2cb2e3fc18272c0a2c6a
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Wed May 9 08:16:15 2012 +0200

    Use groupby()

diff --git a/test/shaping/hb_test_tools.py b/test/shaping/hb_test_tools.py
index 70a9ce1..d3c0939 100644
--- a/test/shaping/hb_test_tools.py
+++ b/test/shaping/hb_test_tools.py
@@ -150,7 +150,8 @@ class DiffFilters:
 
 	@staticmethod
 	def filter_failures (f):
-		for lines in DiffHelpers.separate_test_cases (f):
+		for key, lines in DiffHelpers.separate_test_cases (f):
+			lines = list (lines)
 			if not DiffHelpers.test_passed (lines):
 				for l in lines: yield l
 
@@ -160,7 +161,7 @@ class DiffSinks:
 	def print_stat (f):
 		passed = 0
 		failed = 0
-		for lines in DiffHelpers.separate_test_cases (f):
+		for key, lines in DiffHelpers.separate_test_cases (f):
 			if DiffHelpers.test_passed (lines):
 				passed += 1
 			else:
@@ -176,22 +177,11 @@ class DiffHelpers:
 		   have a colon character, groups them by identifier,
 		   yielding lists of all lines with the same identifier.'''
 
-		acc = []
-		iden = None
-		for l in f:
-			if ':' not in l:
-				if acc: yield acc
-				acc = []
-				iden = None
-				yield [l]
-				continue
-			l_iden = l[1:l.index (':')]
-			if acc and iden != l_iden:
-				yield acc
-				acc = []
-			iden = l_iden
-			acc.append (l)
-		if acc: yield acc
+		def identifier (l):
+			if ':' in l[1:]:
+				return l[1:l.index (':')]
+			return l
+		return groupby (f, key=identifier)
 
 	@staticmethod
 	def test_passed (lines):
commit c438a14b62433db488b5c90854a4a3934adf3305
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Wed May 9 07:45:17 2012 +0200

    Add hb-diff-stat

diff --git a/test/shaping/Makefile.am b/test/shaping/Makefile.am
index f216c5d..81c9991 100644
--- a/test/shaping/Makefile.am
+++ b/test/shaping/Makefile.am
@@ -13,6 +13,7 @@ EXTRA_DIST += \
 	hb-diff \
 	hb-diff-colorize \
 	hb-diff-filter-failures \
+	hb-diff-stat \
 	hb-manifest-read \
 	hb-manifest-update \
 	hb-unicode-decode \
diff --git a/test/shaping/hb-diff-stat b/test/shaping/hb-diff-stat
new file mode 100755
index 0000000..81626e1
--- /dev/null
+++ b/test/shaping/hb-diff-stat
@@ -0,0 +1,5 @@
+#!/usr/bin/python
+
+from hb_test_tools import *
+
+UtilMains.process_multiple_files (DiffSinks.print_stat)
diff --git a/test/shaping/hb_test_tools.py b/test/shaping/hb_test_tools.py
index 17181ac..70a9ce1 100644
--- a/test/shaping/hb_test_tools.py
+++ b/test/shaping/hb_test_tools.py
@@ -151,9 +151,23 @@ class DiffFilters:
 	@staticmethod
 	def filter_failures (f):
 		for lines in DiffHelpers.separate_test_cases (f):
-			if any (l[0] != ' ' for l in lines):
+			if not DiffHelpers.test_passed (lines):
 				for l in lines: yield l
 
+class DiffSinks:
+
+	@staticmethod
+	def print_stat (f):
+		passed = 0
+		failed = 0
+		for lines in DiffHelpers.separate_test_cases (f):
+			if DiffHelpers.test_passed (lines):
+				passed += 1
+			else:
+				failed += 1
+		total = passed + failed
+		print "%d out of %d tests passed.  %d failed (%g%%)" % (passed, total, failed, 100. * failed / total)
+
 class DiffHelpers:
 
 	@staticmethod
@@ -175,10 +189,14 @@ class DiffHelpers:
 			if acc and iden != l_iden:
 				yield acc
 				acc = []
-				iden = l_iden
+			iden = l_iden
 			acc.append (l)
 		if acc: yield acc
 
+	@staticmethod
+	def test_passed (lines):
+		return all (l[0] == ' ' for l in lines)
+
 
 class FilterHelpers:
 
commit 1058d031e2046eb80331b0950eaff75c2bf608dc
Author: Behdad Esfahbod <behdad at behdad.org>
Date:   Wed May 9 07:30:07 2012 +0200

    Make hb-diff-filter-failures retain all test info for failed tests

diff --git a/test/shaping/hb_test_tools.py b/test/shaping/hb_test_tools.py
index 03a7710..17181ac 100644
--- a/test/shaping/hb_test_tools.py
+++ b/test/shaping/hb_test_tools.py
@@ -149,16 +149,35 @@ class ZipDiffer:
 class DiffFilters:
 
 	@staticmethod
-	def filter_failures (f, symbols=diff_symbols):
-		for l in f:
-			if l[0] in symbols:
-				# TODO retain all lines of the failure
-				yield l
+	def filter_failures (f):
+		for lines in DiffHelpers.separate_test_cases (f):
+			if any (l[0] != ' ' for l in lines):
+				for l in lines: yield l
 
+class DiffHelpers:
 
-class ShapeFilters:
+	@staticmethod
+	def separate_test_cases (f):
+		'''Reads lines from f, and if the lines have identifiers, ie.
+		   have a colon character, groups them by identifier,
+		   yielding lists of all lines with the same identifier.'''
 
-	pass
+		acc = []
+		iden = None
+		for l in f:
+			if ':' not in l:
+				if acc: yield acc
+				acc = []
+				iden = None
+				yield [l]
+				continue
+			l_iden = l[1:l.index (':')]
+			if acc and iden != l_iden:
+				yield acc
+				acc = []
+				iden = l_iden
+			acc.append (l)
+		if acc: yield acc
 
 
 class FilterHelpers:



More information about the HarfBuzz mailing list