[HarfBuzz] harfbuzz: Branch 'master' - 3 commits

Thu Apr 9 10:24:46 PDT 2009

contrib/README                          |    9 +
 contrib/harfbuzz-freetype.c             |  149 ++++++++++++++++++
 contrib/harfbuzz-freetype.h             |    9 +
 contrib/harfbuzz-unicode-glib.c         |  169 ++++++++++++++++++++
 contrib/harfbuzz-unicode-tables.c       |   84 ++++++++++
 contrib/harfbuzz-unicode.c              |  264 ++++++++++++++++++++++++++++++++
 contrib/harfbuzz-unicode.h              |   54 ++++++
 contrib/tables/README                   |   17 ++
 contrib/tables/category-parse.py        |   70 ++++++++
 contrib/tables/combining-class-parse.py |   34 ++++
 contrib/tables/grapheme-break-parse.py  |   45 +++++
 contrib/tables/scripts-parse.py         |   75 +++++++++
 contrib/tables/unicode_parse_common.py  |   70 ++++++++
 src/harfbuzz-shaper.h                   |    6 
 tests/fuzzing/fuzz.cc                   |  124 +++++++++++++++
 15 files changed, 1177 insertions(+), 2 deletions(-)

New commits:
commit 48b090ba2c5327a4b4acd14b36e97a70bc60feac
Author: Adam Langley <agl at google.com>
Date:   Mon Apr 6 12:43:27 2009 -0700

    Make HB_ShaperItem a typedef to a named struct.
    
    Previously, HB_ShaperItem was a typedef to an anonymous struct. This precludes
    forward declarations.

diff --git a/src/harfbuzz-shaper.h b/src/harfbuzz-shaper.h
index e8f5513..1577b59 100644
--- a/src/harfbuzz-shaper.h
+++ b/src/harfbuzz-shaper.h
@@ -242,7 +242,9 @@ typedef struct HB_Font_ {
     void *userData;
 } HB_FontRec;
 
-typedef struct {
+typedef struct HB_ShaperItem_ HB_ShaperItem;
+
+struct HB_ShaperItem_ {
     const HB_UChar16 *string;
     hb_uint32 stringLength;
     HB_ScriptItem item;
@@ -262,7 +264,7 @@ typedef struct {
 
     /* internal */
     HB_Bool kerning_applied; /* out: kerning applied by shaper */
-} HB_ShaperItem;
+};
 
 HB_Bool HB_ShapeItem(HB_ShaperItem *item);
 
commit ccce4eb0f8f42c5701b9e010ebb64a417a9d3a1b
Author: Adam Langley <agl at google.com>
Date:   Mon Apr 6 12:36:49 2009 -0700

    Add fuzz testing tool.

diff --git a/tests/fuzzing/fuzz.cc b/tests/fuzzing/fuzz.cc
new file mode 100644
index 0000000..133577a
--- /dev/null
+++ b/tests/fuzzing/fuzz.cc
@@ -0,0 +1,124 @@
+// This is a fuzzing harness for Harfbuzz. Since Harfbuzz's input is generally
+// expected to be controlled by a remote party it's a possible vector for
+// security issues.
+//
+// Fuzzing is a black-box testing scheme where the black-box (Harfbuzz's shaping
+// engine in this case) is fed random input to see if it will misbehave.
+// Misbehaviours can often be turned into security or crash issues.
+//
+// It's expected that one will generally run this under valgrind in order to get
+// better detection of problems.
+
+#include <stdint.h>
+#include <stdio.h>
+
+#include <ft2build.h>
+#include FT_FREETYPE_H
+
+#include "../../src/harfbuzz-shaper.h"
+#include "../../src/harfbuzz-global.h"
+#include "../../src/harfbuzz-gpos.h"
+
+extern "C" {
+#include "../../contrib/harfbuzz-unicode.h"
+#include "../../contrib/harfbuzz-freetype.h"
+}
+
+static FT_Library freetype;
+
+static FT_Face loadFace(const char *path)
+{
+  // Opens face 0 of the font file at |path|; yields 0 when FreeType reports an error.
+  FT_Face loaded;
+  const FT_Error err = FT_New_Face(freetype, path, /* index */ 0, &loaded);
+  if (err) return 0;
+  return loaded;
+}
+
+static const int kWidth = 100;
+static const int kHeight = 100;
+
+static int
+usage(const char *argv0) {  // Prints the expected invocation to stderr.
+  fprintf(stderr, "Usage: %s <TTF file>\n", argv0);
+  return 1;  // always non-zero; main() returns this value directly
+}
+
+int
+main(int argc, char **argv) {  // Fuzz loop: feeds random UTF-16 to the shaper.
+  FT_Init_FreeType(&freetype);
+
+  if (argc != 2)
+    return usage(argv[0]);
+
+  FT_Face face;
+  if (FT_New_Face(freetype, argv[1], 0 /* face index */, &face)) {
+    fprintf(stderr, "Failed to load font file\n");
+    return 1;
+  }
+
+  HB_Face hbFace = HB_NewFace(face, hb_freetype_table_sfnt_get);
+
+  HB_FontRec hbFont;
+  hbFont.klass = &hb_freetype_class;
+  hbFont.userData = face;
+  hbFont.x_ppem  = face->size->metrics.x_ppem;
+  hbFont.y_ppem  = face->size->metrics.y_ppem;
+  hbFont.x_scale = face->size->metrics.x_scale;
+  hbFont.y_scale = face->size->metrics.y_scale;
+
+  // This is the maximum number of bytes of input which we'll feed to Harfbuzz
+  // in one shot. We also overload it and make it the size of the output arrays
+  // as well. (Must be a power of two.)
+  static const unsigned kMaxInputBytes = 1024;
+  uint8_t str[kMaxInputBytes];
+
+  HB_ShaperItem shaper_item;
+  shaper_item.kerning_applied = false;
+  shaper_item.string = (HB_UChar16 *) str;
+  shaper_item.stringLength = 0;
+  shaper_item.item.bidiLevel = 0;
+  shaper_item.shaperFlags = 0;
+  shaper_item.font = &hbFont;
+  shaper_item.face = hbFace;
+  shaper_item.glyphIndicesPresent = false;
+  shaper_item.initialGlyphCount = 0;
+
+  HB_Glyph out_glyphs[kMaxInputBytes];
+  HB_GlyphAttributes out_attrs[kMaxInputBytes];
+  HB_Fixed out_advs[kMaxInputBytes];
+  HB_FixedPoint out_offsets[kMaxInputBytes];
+  unsigned short out_logClusters[kMaxInputBytes];
+
+  shaper_item.glyphs = out_glyphs;
+  shaper_item.attributes = out_attrs;
+  shaper_item.advances = out_advs;
+  shaper_item.offsets = out_offsets;
+  shaper_item.log_clusters = out_logClusters;
+  shaper_item.num_glyphs = kMaxInputBytes;
+
+  FILE *urandom = fopen("/dev/urandom", "rb");
+  if (!urandom) {
+    fprintf(stderr, "Cannot open /dev/urandom\n");
+    return 1;
+  }
+
+  for (;;) {
+    uint16_t len;
+    if (fread(&len, sizeof(len), 1, urandom) != 1) break;  // never use an unread length
+    len &= (kMaxInputBytes - 1);  // mask into the buffer (hence the power of two)
+    len &= ~1;                    // whole 16-bit units only
+    if (len && fread(str, 1, len, urandom) != len) break;  // short read: stop fuzzing
+
+    ssize_t iterator = 0;
+
+    for (;;) {
+      if (!hb_utf16_script_run_next(NULL, &shaper_item.item, (uint16_t *) str, len >> 1, &iterator))
+        break;
+
+      HB_ShapeItem(&shaper_item);
+    }
+  }
+  fclose(urandom);  // reached only when a read from /dev/urandom failed
+  HB_FreeFace(hbFace);
+}
commit 652d766d87b14cb6e123c878e437ade439a0814c
Author: Adam Langley <agl at google.com>
Date:   Mon Apr 6 12:12:17 2009 -0700

    Add contrib/ and initial file set
    
    This set of source files contains code to host Harfbuzz without Qt4 or Pango by
    downloading Unicode tables and providing basic functions to supply the symbols
    required by harfbuzz-external.h

diff --git a/contrib/README b/contrib/README
new file mode 100644
index 0000000..074cc52
--- /dev/null
+++ b/contrib/README
@@ -0,0 +1,9 @@
+Harfbuzz requires several functions to be defined in order to work with the
+platform's Unicode tables etc.
+
+If you are building on top of Qt4 you should look at the code in the tests/
+directory for examples of how to hook up Qt4 functions to Harfbuzz.
+
+Otherwise, this directory contains examples of using downloaded Unicode tables
+and/or glib to host Harfbuzz. You should read the README file in tables/ for how
+to build the header files for some of the Unicode tables.
diff --git a/contrib/harfbuzz-freetype.c b/contrib/harfbuzz-freetype.c
new file mode 100644
index 0000000..a2962df
--- /dev/null
+++ b/contrib/harfbuzz-freetype.c
@@ -0,0 +1,149 @@
+#include <stdint.h>
+
+#include <ft2build.h>
+#include FT_FREETYPE_H
+#include FT_TRUETYPE_TABLES_H
+
+#if 0
+#include <freetype/freetype.h>
+#include <freetype/tttables.h>
+#endif
+
+#include <harfbuzz-shaper.h>
+#include "harfbuzz-unicode.h"
+
+static HB_Bool
+hb_freetype_string_to_glyphs(HB_Font font,
+                             const HB_UChar16 *chars, hb_uint32 len,
+                             HB_Glyph *glyphs, hb_uint32 *numGlyphs,
+                             HB_Bool is_rtl) {
+  // Maps UTF-16 |chars| to one glyph index per code point; |is_rtl| is unused.
+  FT_Face face = (FT_Face) font->userData;
+  if (len > *numGlyphs)
+    return 0;  // output array is smaller than the number of input units
+
+  ssize_t i = 0;  // signed: utf16_to_code_point() takes an ssize_t cursor
+  size_t j = 0;
+  while ((size_t) i < len) {
+    const uint32_t cp = utf16_to_code_point(chars, len, &i);
+    glyphs[j++] = FT_Get_Char_Index(face, cp);
+  }
+
+  *numGlyphs = j;  // number of glyphs actually written
+  return 1;
+}
+
+static void
+hb_freetype_advances_get(HB_Font font, const HB_Glyph *glyphs, hb_uint32 len,
+                         HB_Fixed *advances, int flags) {
+  FT_Face face = (FT_Face) font->userData;  // userData is treated as the FT_Face
+
+  hb_uint32 i;
+  for (i = 0; i < len; ++i) {
+    const FT_Error error = FT_Load_Glyph(face, glyphs[i], FT_LOAD_DEFAULT);
+    if (error) {
+      advances[i] = 0;  // a glyph that fails to load gets a zero advance
+      continue;
+    }
+
+    advances[i] = face->glyph->advance.x;  // x advance only; |flags| is not consulted
+  }
+}
+
+static HB_Bool
+hb_freetype_can_render(HB_Font font, const HB_UChar16 *chars, hb_uint32 len) {
+  // Returns 1 only if every code point in |chars| maps to a non-zero glyph.
+  FT_Face face = (FT_Face)font->userData;
+  ssize_t i = 0;  // signed: utf16_to_code_point() takes an ssize_t cursor
+  while ((size_t) i < len) {
+    const uint32_t cp = utf16_to_code_point(chars, len, &i);
+    if (FT_Get_Char_Index(face, cp) == 0)
+      return 0;
+  }
+
+  return 1;
+}
+
+static HB_Error
+hb_freetype_outline_point_get(HB_Font font, HB_Glyph glyph, int flags,
+                              hb_uint32 point, HB_Fixed *xpos, HB_Fixed *ypos,
+                              hb_uint32 *n_points) {
+  HB_Error error = HB_Err_Ok;
+  FT_Face face = (FT_Face) font->userData;
+  // Fetches outline point |point| of |glyph|; *n_points receives the point count.
+  int load_flags = (flags & HB_ShaperFlag_UseDesignMetrics) ? FT_LOAD_NO_HINTING : FT_LOAD_DEFAULT;
+
+  if ((error = (HB_Error) FT_Load_Glyph(face, glyph, load_flags)))
+    return error;
+
+  if (face->glyph->format != ft_glyph_format_outline)
+    return (HB_Error)HB_Err_Invalid_SubTable;
+
+  *n_points = face->glyph->outline.n_points;
+  if (!(*n_points))
+    return HB_Err_Ok;  // no points: *xpos and *ypos are left untouched
+  // Valid indices are 0 .. *n_points - 1, so |point| == *n_points is rejected too.
+  if (point >= *n_points)
+    return (HB_Error)HB_Err_Invalid_SubTable;
+
+  *xpos = face->glyph->outline.points[point].x;
+  *ypos = face->glyph->outline.points[point].y;
+
+  return HB_Err_Ok;
+}
+
+static void
+hb_freetype_glyph_metrics_get(HB_Font font, HB_Glyph glyph,
+                              HB_GlyphMetrics *metrics) {
+  FT_Face face = (FT_Face) font->userData;  // userData is treated as the FT_Face
+
+  const FT_Error error = FT_Load_Glyph(face, glyph, FT_LOAD_DEFAULT);
+  if (error) {
+    metrics->x = metrics->y = metrics->width = metrics->height = 0;  // load failed: report all-zero metrics
+    metrics->xOffset = metrics->yOffset = 0;
+    return;
+  }
+
+  const FT_Glyph_Metrics *ftmetrics = &face->glyph->metrics;  // metrics of the glyph just loaded
+  metrics->width = ftmetrics->width;
+  metrics->height = ftmetrics->height;
+  metrics->x = ftmetrics->horiAdvance;
+  metrics->y = 0;  // unclear what this is
+  metrics->xOffset = ftmetrics->horiBearingX;
+  metrics->yOffset = ftmetrics->horiBearingY;
+}
+
+static HB_Fixed
+hb_freetype_font_metric_get(HB_Font font, HB_FontMetric metric) {
+  FT_Face face = (FT_Face) font->userData;  // userData is treated as the FT_Face
+
+  switch (metric) {
+  case HB_FontAscent:
+    // Note that we aren't scanning the VDMX table which we probably would in
+    // an ideal world.
+    return face->ascender;
+  default:
+    return 0;  // every metric other than HB_FontAscent is reported as 0
+  }
+}
+
+const HB_FontClass hb_freetype_class = {  // the FreeType-backed callbacks defined in this file
+  hb_freetype_string_to_glyphs,
+  hb_freetype_advances_get,
+  hb_freetype_can_render,
+  hb_freetype_outline_point_get,
+  hb_freetype_glyph_metrics_get,
+  hb_freetype_font_metric_get,
+};
+
+HB_Error
+hb_freetype_table_sfnt_get(void *voidface, const HB_Tag tag, HB_Byte *buffer, HB_UInt *len) {
+  FT_Face face = (FT_Face) voidface;  // |voidface| is treated as an FT_Face
+  FT_ULong ftlen = *len;  // FreeType wants an FT_ULong length, so go through a temporary
+
+  if (!FT_IS_SFNT(face))
+    return HB_Err_Invalid_Argument;  // only SFNT faces are accepted
+
+  const FT_Error error = FT_Load_Sfnt_Table(face, tag, 0, buffer, &ftlen);
+  *len = ftlen;  // copy back the length FreeType reported
+  return (HB_Error) error;  // the FreeType error code is returned as an HB_Error unchanged
+}
diff --git a/contrib/harfbuzz-freetype.h b/contrib/harfbuzz-freetype.h
new file mode 100644
index 0000000..628be16
--- /dev/null
+++ b/contrib/harfbuzz-freetype.h
@@ -0,0 +1,9 @@
+#ifndef HB_FREETYPE_H_
+#define HB_FREETYPE_H_
+
+extern const HB_FontClass hb_freetype_class;
+
+HB_Error hb_freetype_table_sfnt_get(void *voidface, const HB_Tag tag,
+                                    HB_Byte *buffer, HB_UInt *len);
+
+#endif  // HB_FREETYPE_H_
diff --git a/contrib/harfbuzz-unicode-glib.c b/contrib/harfbuzz-unicode-glib.c
new file mode 100644
index 0000000..6a13433
--- /dev/null
+++ b/contrib/harfbuzz-unicode-glib.c
@@ -0,0 +1,169 @@
+#include "harfbuzz-external.h"
+
+#include <glib.h>
+
+static int
+hb_category_for_char(HB_UChar32 ch) {  // Maps g_unichar_type(ch) to an HB_ category constant.
+  switch (g_unichar_type(ch)) {
+    case G_UNICODE_CONTROL:
+      return HB_Other_Control;
+    case G_UNICODE_FORMAT:
+      return HB_Other_Format;
+    case G_UNICODE_UNASSIGNED:
+      return HB_Other_NotAssigned;
+    case G_UNICODE_PRIVATE_USE:
+      return HB_Other_PrivateUse;
+    case G_UNICODE_SURROGATE:
+      return HB_Other_Surrogate;
+    case G_UNICODE_LOWERCASE_LETTER:
+      return HB_Letter_Lowercase;
+    case G_UNICODE_MODIFIER_LETTER:
+      return HB_Letter_Modifier;
+    case G_UNICODE_OTHER_LETTER:
+      return HB_Letter_Other;
+    case G_UNICODE_TITLECASE_LETTER:
+      return HB_Letter_Titlecase;
+    case G_UNICODE_UPPERCASE_LETTER:
+      return HB_Letter_Uppercase;
+    case G_UNICODE_COMBINING_MARK:
+      return HB_Mark_SpacingCombining;
+    case G_UNICODE_ENCLOSING_MARK:
+      return HB_Mark_Enclosing;
+    case G_UNICODE_NON_SPACING_MARK:
+      return HB_Mark_NonSpacing;
+    case G_UNICODE_DECIMAL_NUMBER:
+      return HB_Number_DecimalDigit;
+    case G_UNICODE_LETTER_NUMBER:
+      return HB_Number_Letter;
+    case G_UNICODE_OTHER_NUMBER:
+      return HB_Number_Other;
+    case G_UNICODE_CONNECT_PUNCTUATION:
+      return HB_Punctuation_Connector;
+    case G_UNICODE_DASH_PUNCTUATION:
+      return HB_Punctuation_Dash;
+    case G_UNICODE_CLOSE_PUNCTUATION:
+      return HB_Punctuation_Close;
+    case G_UNICODE_FINAL_PUNCTUATION:
+      return HB_Punctuation_FinalQuote;
+    case G_UNICODE_INITIAL_PUNCTUATION:
+      return HB_Punctuation_InitialQuote;
+    case G_UNICODE_OTHER_PUNCTUATION:
+      return HB_Punctuation_Other;
+    case G_UNICODE_OPEN_PUNCTUATION:
+      return HB_Punctuation_Open;
+    case G_UNICODE_CURRENCY_SYMBOL:
+      return HB_Symbol_Currency;
+    case G_UNICODE_MODIFIER_SYMBOL:
+      return HB_Symbol_Modifier;
+    case G_UNICODE_MATH_SYMBOL:
+      return HB_Symbol_Math;
+    case G_UNICODE_OTHER_SYMBOL:
+      return HB_Symbol_Other;
+    case G_UNICODE_LINE_SEPARATOR:
+      return HB_Separator_Line;
+    case G_UNICODE_PARAGRAPH_SEPARATOR:
+      return HB_Separator_Paragraph;
+    case G_UNICODE_SPACE_SEPARATOR:
+      return HB_Separator_Space;
+    default:
+      return HB_Symbol_Other;  // any glib type without a case above
+  }
+}
+
+HB_LineBreakClass
+HB_GetLineBreakClass(HB_UChar32 ch) {  // Maps g_unichar_break_type(ch) to an HB_LineBreak_ constant.
+  switch (g_unichar_break_type(ch)) {
+    case G_UNICODE_BREAK_MANDATORY:
+      return HB_LineBreak_BK;
+    case G_UNICODE_BREAK_CARRIAGE_RETURN:
+      return HB_LineBreak_CR;
+    case G_UNICODE_BREAK_LINE_FEED:
+      return HB_LineBreak_LF;
+    case G_UNICODE_BREAK_COMBINING_MARK:
+      return HB_LineBreak_CM;
+    case G_UNICODE_BREAK_SURROGATE:
+      return HB_LineBreak_SG;
+    case G_UNICODE_BREAK_ZERO_WIDTH_SPACE:
+      return HB_LineBreak_ZW;
+    case G_UNICODE_BREAK_INSEPARABLE:
+      return HB_LineBreak_IN;
+    case G_UNICODE_BREAK_NON_BREAKING_GLUE:
+      return HB_LineBreak_GL;
+    case G_UNICODE_BREAK_CONTINGENT:
+      return HB_LineBreak_AL;  // folded into AL (same value as the default case)
+    case G_UNICODE_BREAK_SPACE:
+      return HB_LineBreak_SP;
+    case G_UNICODE_BREAK_AFTER:
+      return HB_LineBreak_BA;
+    case G_UNICODE_BREAK_BEFORE:
+      return HB_LineBreak_BB;
+    case G_UNICODE_BREAK_BEFORE_AND_AFTER:
+      return HB_LineBreak_B2;
+    case G_UNICODE_BREAK_HYPHEN:
+      return HB_LineBreak_HY;
+    case G_UNICODE_BREAK_NON_STARTER:
+      return HB_LineBreak_NS;
+    case G_UNICODE_BREAK_OPEN_PUNCTUATION:
+      return HB_LineBreak_OP;
+    case G_UNICODE_BREAK_CLOSE_PUNCTUATION:
+      return HB_LineBreak_CL;
+    case G_UNICODE_BREAK_QUOTATION:
+      return HB_LineBreak_QU;
+    case G_UNICODE_BREAK_EXCLAMATION:
+      return HB_LineBreak_EX;
+    case G_UNICODE_BREAK_IDEOGRAPHIC:
+      return HB_LineBreak_ID;
+    case G_UNICODE_BREAK_NUMERIC:
+      return HB_LineBreak_NU;
+    case G_UNICODE_BREAK_INFIX_SEPARATOR:
+      return HB_LineBreak_IS;
+    case G_UNICODE_BREAK_SYMBOL:
+      return HB_LineBreak_SY;
+    case G_UNICODE_BREAK_ALPHABETIC:
+      return HB_LineBreak_AL;
+    case G_UNICODE_BREAK_PREFIX:
+      return HB_LineBreak_PR;
+    case G_UNICODE_BREAK_POSTFIX:
+      return HB_LineBreak_PO;
+    case G_UNICODE_BREAK_COMPLEX_CONTEXT:
+      return HB_LineBreak_SA;
+    case G_UNICODE_BREAK_AMBIGUOUS:
+      return HB_LineBreak_AL;  // folded into AL
+    case G_UNICODE_BREAK_UNKNOWN:
+      return HB_LineBreak_AL;  // folded into AL
+    case G_UNICODE_BREAK_NEXT_LINE:
+      return HB_LineBreak_AL;  // folded into AL
+    case G_UNICODE_BREAK_WORD_JOINER:
+      return HB_LineBreak_WJ;
+    case G_UNICODE_BREAK_HANGUL_L_JAMO:
+      return HB_LineBreak_JL;
+    case G_UNICODE_BREAK_HANGUL_V_JAMO:
+      return HB_LineBreak_JV;
+    case G_UNICODE_BREAK_HANGUL_T_JAMO:
+      return HB_LineBreak_JT;
+    case G_UNICODE_BREAK_HANGUL_LV_SYLLABLE:
+      return HB_LineBreak_H2;
+    case G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE:
+      return HB_LineBreak_H3;
+    default:
+      return HB_LineBreak_AL;  // any glib break type without a case above
+  }
+}
+
+int
+HB_GetUnicodeCharCombiningClass(HB_UChar32 ch) {
+  return g_unichar_combining_class(ch);  // straight pass-through to glib
+}
+
+void
+HB_GetUnicodeCharProperties(HB_UChar32 ch,
+                            HB_CharCategory *category,
+                            int *combiningClass) {
+  *category = hb_category_for_char(ch);  // both outputs are always written
+  *combiningClass = g_unichar_combining_class(ch);
+}
+
+HB_CharCategory
+HB_GetUnicodeCharCategory(HB_UChar32 ch) {
+  return hb_category_for_char(ch);  // the helper's int result is returned as HB_CharCategory
+}
diff --git a/contrib/harfbuzz-unicode-tables.c b/contrib/harfbuzz-unicode-tables.c
new file mode 100644
index 0000000..3c3fead
--- /dev/null
+++ b/contrib/harfbuzz-unicode-tables.c
@@ -0,0 +1,84 @@
+#include <stdlib.h>
+#include <stdint.h>
+
+#include <harfbuzz-external.h>
+
+#include "tables/category-properties.h"
+#include "tables/combining-properties.h"
+
+HB_LineBreakClass
+HB_GetLineBreakClass(HB_UChar32 ch) {
+  abort();  // unconditional: this file has no line-break data, so any call terminates the process
+  return 0;  // never reached
+}
+
+static int
+combining_property_cmp(const void *vkey, const void *vcandidate) {
+  // bsearch() comparator: |vkey| is never dereferenced, it carries the code
+  // point itself as a pointer-sized integer.
+  const struct combining_property *range = vcandidate;
+  const uint32_t cp = (uint32_t) (intptr_t) vkey;
+
+  if (cp < range->range_start)
+    return -1;
+  if (cp > range->range_end)
+    return 1;
+  return 0;  // inside the inclusive [range_start, range_end]
+}
+
+static int
+code_point_to_combining_class(HB_UChar32 cp) {
+  const void *vprop = bsearch((void *) (intptr_t) cp, combining_properties,  // key is the code point itself, cast to a pointer
+                              combining_properties_count,
+                              sizeof(struct combining_property),
+                              combining_property_cmp);
+  if (!vprop)
+    return 0;  // no range contains cp
+
+  return ((const struct combining_property *) vprop)->klass;
+}
+
+int
+HB_GetUnicodeCharCombiningClass(HB_UChar32 ch) {
+  // Table-backed lookup; a code point outside every range reports class 0.
+  return code_point_to_combining_class(ch);
+}
+
+static int
+category_property_cmp(const void *vkey, const void *vcandidate) {
+  // bsearch() comparator: |vkey| is never dereferenced, it carries the code
+  // point itself as a pointer-sized integer.
+  const struct category_property *range = vcandidate;
+  const uint32_t cp = (uint32_t) (intptr_t) vkey;
+
+  if (cp < range->range_start)
+    return -1;
+  if (cp > range->range_end)
+    return 1;
+  return 0;  // inside the inclusive [range_start, range_end]
+}
+
+static HB_CharCategory
+code_point_to_category(HB_UChar32 cp) {
+  const void *vprop = bsearch((void *) (intptr_t) cp, category_properties,  // key is the code point itself, cast to a pointer
+                              category_properties_count,
+                              sizeof(struct category_property),
+                              category_property_cmp);
+  if (!vprop)
+    return HB_NoCategory;  // no range contains cp
+
+  return ((const struct category_property *) vprop)->category;
+}
+
+void
+HB_GetUnicodeCharProperties(HB_UChar32 ch,
+                            HB_CharCategory *category,
+                            int *combiningClass) {
+  *category = code_point_to_category(ch);  // both outputs are always written
+  *combiningClass = code_point_to_combining_class(ch);
+}
+
+HB_CharCategory
+HB_GetUnicodeCharCategory(HB_UChar32 ch) {
+  return code_point_to_category(ch);  // HB_NoCategory when ch is in no table range
+}
diff --git a/contrib/harfbuzz-unicode.c b/contrib/harfbuzz-unicode.c
new file mode 100644
index 0000000..9b3c43e
--- /dev/null
+++ b/contrib/harfbuzz-unicode.c
@@ -0,0 +1,264 @@
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <harfbuzz-external.h>
+#include <harfbuzz-impl.h>
+#include <harfbuzz-shaper.h>
+#include "harfbuzz-unicode.h"
+
+#include "tables/script-properties.h"
+#include "tables/grapheme-break-properties.h"
+
+uint32_t
+utf16_to_code_point(const uint16_t *chars, size_t len, ssize_t *iter) {
+  const uint16_t v = chars[(*iter)++];  // no bounds check here: *iter must already be < len
+  if (HB_IsHighSurrogate(v)) {
+    // surrogate pair
+    if (*iter >= len) {
+      // the surrogate is incomplete.
+      return HB_InvalidCodePoint;
+    }
+    const uint16_t v2 = chars[(*iter)++];  // consumed even if it turns out not to be a low surrogate
+    if (!HB_IsLowSurrogate(v2)) {
+      // invalid surrogate pair.
+      return HB_InvalidCodePoint;
+    }
+
+    return HB_SurrogateToUcs4(v, v2);
+  }
+
+  if (HB_IsLowSurrogate(v)) {
+    // this isn't a valid code point
+    return HB_InvalidCodePoint;
+  }
+
+  return v;  // a non-surrogate unit is returned unchanged
+}
+
+uint32_t
+utf16_to_code_point_prev(const uint16_t *chars, size_t len, ssize_t *iter) {
+  const uint16_t v = chars[(*iter)--];  // no bounds check here: *iter must be a valid index; |len| is unused
+  if (HB_IsLowSurrogate(v)) {
+    // surrogate pair
+    if (*iter < 0) {
+      // the surrogate is incomplete.
+      return HB_InvalidCodePoint;
+    }
+    const uint16_t v2 = chars[(*iter)--];  // consumed even if it turns out not to be a high surrogate
+    if (!HB_IsHighSurrogate(v2)) {
+      // invalid surrogate pair.
+      return HB_InvalidCodePoint;
+    }
+
+    return HB_SurrogateToUcs4(v2, v);  // v2 is the high half, v the low half
+  }
+
+  if (HB_IsHighSurrogate(v)) {
+    // this isn't a valid code point
+    return HB_InvalidCodePoint;
+  }
+
+  return v;  // a non-surrogate unit is returned unchanged
+}
+
+static int
+script_property_cmp(const void *vkey, const void *vcandidate) {
+  // bsearch() comparator: |vkey| is never dereferenced, it carries the code
+  // point itself as a pointer-sized integer.
+  const struct script_property *range = vcandidate;
+  const uint32_t cp = (uint32_t) (intptr_t) vkey;
+
+  if (cp < range->range_start)
+    return -1;
+  if (cp > range->range_end)
+    return 1;
+  return 0;  // inside the inclusive [range_start, range_end]
+}
+
+HB_Script
+code_point_to_script(uint32_t cp) {
+  const void *vprop = bsearch((void *) (intptr_t) cp, script_properties,  // key is the code point itself, cast to a pointer
+                              script_properties_count,
+                              sizeof(struct script_property),
+                              script_property_cmp);
+  if (!vprop)
+    return HB_Script_Common;  // no range contains cp
+
+  return ((const struct script_property *) vprop)->script;
+}
+
+char
+hb_utf16_script_run_next(unsigned *num_code_points, HB_ScriptItem *output,
+                         const uint16_t *chars, size_t len, ssize_t *iter) {
+  // Scans forward from *iter for one run of code points sharing a script.
+  if (*iter == len)
+    return 0;  // input exhausted: no further runs
+  output->pos = *iter;
+  const uint32_t init_cp = utf16_to_code_point(chars, len, iter);
+  unsigned cps = 1;
+  if (init_cp == HB_InvalidCodePoint)
+    return 0;  // malformed UTF-16 ends the scan
+  const HB_Script init_script = code_point_to_script(init_cp);
+  HB_Script current_script = init_script;
+  output->script = init_script;
+
+  for (;;) {
+    if (*iter == len)
+      break;
+    const ssize_t prev_iter = *iter;
+    const uint32_t cp = utf16_to_code_point(chars, len, iter);
+    if (cp == HB_InvalidCodePoint)
+      return 0;
+    cps++;
+    const HB_Script script = code_point_to_script(cp);
+
+    if (script != current_script) {
+      if (current_script == HB_Script_Inherited && init_script == HB_Script_Inherited) {
+        // If we started off as inherited, we take whatever we can find.
+        output->script = script;
+        current_script = script;
+        continue;
+      } else if (script == HB_Script_Inherited) {
+        // An inherited code point joins the run without changing its script.
+        continue;
+      } else {
+        *iter = prev_iter;  // un-read the code point that starts the next run
+        cps--;
+        break;
+      }
+    }
+  }
+
+  if (output->script == HB_Script_Inherited)
+    output->script = HB_Script_Common;  // a run of only inherited characters
+
+  output->length = *iter - output->pos;
+  if (num_code_points)
+    *num_code_points = cps;
+  return 1;
+}
+
+char
+hb_utf16_script_run_prev(unsigned *num_code_points, HB_ScriptItem *output,
+                         const uint16_t *chars, size_t len, ssize_t *iter) {
+  // Scans backwards from *iter for one run of code points sharing a script.
+  if (*iter < 0)
+    return 0;  // already moved past the start of the string
+  const size_t ending_index = *iter;
+  const uint32_t init_cp = utf16_to_code_point_prev(chars, len, iter);
+  unsigned cps = 1;
+  if (init_cp == HB_InvalidCodePoint)
+    return 0;  // malformed UTF-16 ends the scan
+  const HB_Script init_script = code_point_to_script(init_cp);
+  HB_Script current_script = init_script;
+  output->script = init_script;
+
+  for (;;) {
+    if (*iter < 0)
+      break;
+    const ssize_t prev_iter = *iter;
+    const uint32_t cp = utf16_to_code_point_prev(chars, len, iter);
+    if (cp == HB_InvalidCodePoint)
+      return 0;
+    cps++;
+    const HB_Script script = code_point_to_script(cp);
+
+    if (script != current_script) {
+      if (current_script == HB_Script_Inherited && init_script == HB_Script_Inherited) {
+        // If we started off as inherited, we take whatever we can find.
+        output->script = script;
+        current_script = script;
+        continue;
+      } else if (script == HB_Script_Inherited) {
+        // An inherited code point joins the run without changing its script.
+        continue;
+      } else {
+        *iter = prev_iter;  // un-read the code point that belongs to the previous run
+        cps--;
+        break;
+      }
+    }
+  }
+
+  if (output->script == HB_Script_Inherited)
+    output->script = HB_Script_Common;  // a run of only inherited characters
+
+  output->pos = *iter + 1;  // *iter now indexes the unit just before the run
+  output->length = ending_index - *iter;
+  if (num_code_points)
+    *num_code_points = cps;
+  return 1;
+}
+
+static int
+grapheme_break_property_cmp(const void *vkey, const void *vcandidate) {
+  // bsearch() comparator: |vkey| is never dereferenced, it carries the code
+  // point itself as a pointer-sized integer.
+  const struct grapheme_break_property *range = vcandidate;
+  const uint32_t cp = (uint32_t) (intptr_t) vkey;
+
+  if (cp < range->range_start)
+    return -1;
+  if (cp > range->range_end)
+    return 1;
+  return 0;  // inside the inclusive [range_start, range_end]
+}
+
+HB_GraphemeClass
+HB_GetGraphemeClass(HB_UChar32 ch) {
+  const void *vprop = bsearch((void *) (intptr_t) ch, grapheme_break_properties,  // key is the code point itself, cast to a pointer
+                              grapheme_break_properties_count,
+                              sizeof(struct grapheme_break_property),
+                              grapheme_break_property_cmp);
+  if (!vprop)
+    return HB_Grapheme_Other;  // no range contains ch
+
+  return ((const struct grapheme_break_property *) vprop)->klass;
+}
+
+HB_WordClass
+HB_GetWordClass(HB_UChar32 ch) {
+  abort();  // stub: any call terminates the process
+  return 0;  // never reached
+}
+
+HB_SentenceClass
+HB_GetSentenceClass(HB_UChar32 ch) {
+  abort();  // stub: any call terminates the process
+  return 0;  // never reached
+}
+
+void
+HB_GetGraphemeAndLineBreakClass(HB_UChar32 ch, HB_GraphemeClass *gclass, HB_LineBreakClass *breakclass) {
+  *gclass = HB_GetGraphemeClass(ch);  // both outputs are always written
+  *breakclass = HB_GetLineBreakClass(ch);  // resolved at link time by whichever Unicode backend is built in
+}
+
+HB_UChar16
+HB_GetMirroredChar(HB_UChar16 ch) {
+  abort();  // stub: any call terminates the process
+  return 0;  // never reached
+}
+
+void *
+HB_Library_Resolve(const char *library, const char *symbol) {
+  abort();  // stub: any call terminates the process
+  return NULL;  // never reached
+}
+
+void *
+HB_TextCodecForMib(int mib) {
+  abort();  // stub: any call terminates the process
+  return NULL;  // never reached
+}
+
+char *
+HB_TextCodec_ConvertFromUnicode(void *codec, const HB_UChar16 *unicode, hb_uint32 length, hb_uint32 *outputLength) {
+  abort();  // stub: any call terminates the process
+  return NULL;  // never reached
+}
+
+void
+HB_TextCodec_FreeResult(char *v) {
+  abort();  // stub: any call terminates the process
+}
diff --git a/contrib/harfbuzz-unicode.h b/contrib/harfbuzz-unicode.h
new file mode 100644
index 0000000..f28b3c3
--- /dev/null
+++ b/contrib/harfbuzz-unicode.h
@@ -0,0 +1,54 @@
+#ifndef SCRIPT_IDENTIFY_H_
+#define SCRIPT_IDENTIFY_H_
+
+#include <stdint.h>
+
+#include <harfbuzz-shaper.h>
+
+static const uint32_t HB_InvalidCodePoint = 0xffffffffu;
+
+// -----------------------------------------------------------------------------
+// Return the next Unicode code point from a UTF-16 vector
+//   chars: a pointer to @len words
+//   iter: (input/output) an index into @chars. This is updated.
+//   returns: HB_InvalidCodePoint on error and the code point otherwise.
+// -----------------------------------------------------------------------------
+uint32_t utf16_to_code_point(const uint16_t *chars, size_t len, ssize_t *iter);
+
+// -----------------------------------------------------------------------------
+// Like the above, except that the code points are traversed backwards. Thus,
+// on the first call, |iter| should be |len| - 1.
+// -----------------------------------------------------------------------------
+uint32_t utf16_to_code_point_prev(const uint16_t *chars, size_t len, ssize_t *iter);
+
+// -----------------------------------------------------------------------------
+// Return the script of the given code point
+// -----------------------------------------------------------------------------
+HB_Script code_point_to_script(uint32_t cp);
+
+// -----------------------------------------------------------------------------
+// Find the next script run in a UTF-16 string.
+//
+// A script run is a subvector of codepoints, all of which are in the same
+// script. A run will never cut a surrogate pair in half at either end.
+//
+// num_code_points: (output, maybe NULL) the number of code points in the run
+// output: (output) the @pos, @length and @script fields are set on success
+// chars: the UTF-16 string
+// len: the length of @chars, in words
+// iter: (in/out) the current index into the string. This should be 0 for the
+//   first call and is updated on exit.
+//
+// returns: non-zero if a script run was found and returned.
+// -----------------------------------------------------------------------------
+char hb_utf16_script_run_next(unsigned *num_code_points, HB_ScriptItem *output,
+                              const uint16_t *chars, size_t len, ssize_t *iter);
+
+// -----------------------------------------------------------------------------
+// This is the same as above, except that the input is traversed backwards.
+// Thus, on the first call, |iter| should be |len| - 1.
+// -----------------------------------------------------------------------------
+char hb_utf16_script_run_prev(unsigned *num_code_points, HB_ScriptItem *output,
+                              const uint16_t *chars, size_t len, ssize_t *iter);
+
+#endif
diff --git a/contrib/tables/README b/contrib/tables/README
new file mode 100644
index 0000000..605d1c0
--- /dev/null
+++ b/contrib/tables/README
@@ -0,0 +1,17 @@
+This directory contains Python scripts to parse several of the Unicode tables
+that are downloadable from the web and generate C header files from them.
+
+These are the locations of the files which are parsed. You should download these
+files and put them in this directory.
+
+http://www.unicode.org/Public/5.1.0/ucd/extracted/DerivedGeneralCategory.txt
+http://www.unicode.org/Public/5.1.0/ucd/extracted/DerivedCombiningClass.txt
+http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt
+http://www.unicode.org/Public/5.1.0/ucd/Scripts.txt
+
+Then you can run the following python scripts to generate the header files:
+
+python category-parse.py DerivedGeneralCategory.txt category-properties.h
+python combining-class-parse.py DerivedCombiningClass.txt combining-properties.h
+python grapheme-break-parse.py GraphemeBreakProperty.txt grapheme-break-properties.h
+python scripts-parse.py Scripts.txt script-properties.h
diff --git a/contrib/tables/category-parse.py b/contrib/tables/category-parse.py
new file mode 100644
index 0000000..6818c1d
--- /dev/null
+++ b/contrib/tables/category-parse.py
@@ -0,0 +1,70 @@
+import sys
+from unicode_parse_common import *
+
+# http://www.unicode.org/Public/5.1.0/ucd/extracted/DerivedGeneralCategory.txt
+
+category_to_harfbuzz = {
+  'Mn': 'HB_Mark_NonSpacing',
+  'Mc': 'HB_Mark_SpacingCombining',
+  'Me': 'HB_Mark_Enclosing',
+
+  'Nd': 'HB_Number_DecimalDigit',
+  'Nl': 'HB_Number_Letter',
+  'No': 'HB_Number_Other',
+
+  'Zs': 'HB_Separator_Space',
+  'Zl': 'HB_Separator_Line',
+  'Zp': 'HB_Separator_Paragraph',
+
+  'Cc': 'HB_Other_Control',
+  'Cf': 'HB_Other_Format',
+  'Cs': 'HB_Other_Surrogate',
+  'Co': 'HB_Other_PrivateUse',
+  'Cn': 'HB_Other_NotAssigned',
+
+  'Lu': 'HB_Letter_Uppercase',
+  'Ll': 'HB_Letter_Lowercase',
+  'Lt': 'HB_Letter_Titlecase',
+  'Lm': 'HB_Letter_Modifier',
+  'Lo': 'HB_Letter_Other',
+
+  'Pc': 'HB_Punctuation_Connector',
+  'Pd': 'HB_Punctuation_Dash',
+  'Ps': 'HB_Punctuation_Open',
+  'Pe': 'HB_Punctuation_Close',
+  'Pi': 'HB_Punctuation_InitialQuote',
+  'Pf': 'HB_Punctuation_FinalQuote',
+  'Po': 'HB_Punctuation_Other',
+
+  'Sm': 'HB_Symbol_Math',
+  'Sc': 'HB_Symbol_Currency',
+  'Sk': 'HB_Symbol_Modifier',
+  'So': 'HB_Symbol_Other',
+}
+
+def main(infile, outfile):  # Writes a C header defining category_properties[] to outfile.
+  ranges = unicode_file_parse(infile, category_to_harfbuzz)  # helper from unicode_parse_common
+  ranges = sort_and_merge(ranges)  # result is iterated below as (start, end, value) tuples
+
+  print >>outfile, '// Generated from Unicode script tables\n'
+  print >>outfile, '#ifndef CATEGORY_PROPERTIES_H_'
+  print >>outfile, '#define CATEGORY_PROPERTIES_H_\n'
+  print >>outfile, '#include <stdint.h>'
+  print >>outfile, '#include "harfbuzz-external.h"\n'
+  print >>outfile, 'struct category_property {'
+  print >>outfile, '  uint32_t range_start;'
+  print >>outfile, '  uint32_t range_end;'
+  print >>outfile, '  HB_CharCategory category;'
+  print >>outfile, '};\n'
+  print >>outfile, 'static const struct category_property category_properties[] = {'
+  for (start, end, value) in ranges:  # one initializer row per range
+    print >>outfile, '  {0x%x, 0x%x, %s},' % (start, end, value)
+  print >>outfile, '};\n'
+  print >>outfile, 'static const unsigned category_properties_count = %d;\n' % len(ranges)
+  print >>outfile, '#endif  // CATEGORY_PROPERTIES_H_'
+
+if __name__ == '__main__':
+  if len(sys.argv) != 3:
+    print 'Usage: %s <input .txt> <output .h>' % sys.argv[0]
+  else:
+    main(file(sys.argv[1], 'r'), file(sys.argv[2], 'w+'))
diff --git a/contrib/tables/combining-class-parse.py b/contrib/tables/combining-class-parse.py
new file mode 100644
index 0000000..c591ddd
--- /dev/null
+++ b/contrib/tables/combining-class-parse.py
@@ -0,0 +1,34 @@
+import sys
+from unicode_parse_common import *
+
+# http://www.unicode.org/Public/5.1.0/ucd/extracted/DerivedCombiningClass.txt
+
+class IdentityMap(object):
+  '''Mapping stand-in whose item lookup returns the key itself.'''
+  __getitem__ = lambda self, key: key
+
+def main(infile, outfile):
+  '''Write a C header to @outfile listing the merged code-point ranges from
+     @infile with their combining class; entries of class '0' are left out.'''
+  merged = sort_and_merge(unicode_file_parse(infile, IdentityMap(), '0'))
+
+  print >>outfile, '// Generated from Unicode tables\n'
+  print >>outfile, '#ifndef COMBINING_PROPERTIES_H_'
+  print >>outfile, '#define COMBINING_PROPERTIES_H_\n'
+  print >>outfile, '#include <stdint.h>'
+  print >>outfile, 'struct combining_property {'
+  print >>outfile, '  uint32_t range_start;\n  uint32_t range_end;'
+  print >>outfile, '  uint8_t klass;'
+  print >>outfile, '};\n'
+  print >>outfile, 'static const struct combining_property combining_properties[] = {'
+  for entry in merged:
+    print >>outfile, '  {0x%x, 0x%x, %s},' % entry
+  print >>outfile, '};\n'
+  print >>outfile, 'static const unsigned combining_properties_count = %d;\n' % len(merged)
+  print >>outfile, '#endif  // COMBINING_PROPERTIES_H_'
+
+if __name__ == '__main__':
+  if len(sys.argv) != 3:
+    # sys.exit(str) prints the text to stderr and exits with status 1.
+    sys.exit('Usage: %s <input .txt> <output .h>' % sys.argv[0])
+  main(file(sys.argv[1], 'r'), file(sys.argv[2], 'w+'))
diff --git a/contrib/tables/grapheme-break-parse.py b/contrib/tables/grapheme-break-parse.py
new file mode 100644
index 0000000..a4b3534
--- /dev/null
+++ b/contrib/tables/grapheme-break-parse.py
@@ -0,0 +1,45 @@
+import sys
+from unicode_parse_common import *
+
+# http://www.unicode.org/Public/UNIDATA/auxiliary/GraphemeBreakProperty.txt
+
+property_to_harfbuzz = {  # property name from the input file -> HB_Grapheme_* name
+  'CR': 'HB_Grapheme_CR',
+  'LF': 'HB_Grapheme_LF',
+  'Control': 'HB_Grapheme_Control',
+  'Extend': 'HB_Grapheme_Extend',
+  'Prepend': 'HB_Grapheme_Other',  # folded into the generic Other class
+  'SpacingMark': 'HB_Grapheme_Other',  # folded into the generic Other class
+  'L': 'HB_Grapheme_L',
+  'V': 'HB_Grapheme_V',
+  'T': 'HB_Grapheme_T',
+  'LV': 'HB_Grapheme_LV',
+  'LVT': 'HB_Grapheme_LVT',
+}  # plain dict: a property name not listed above raises KeyError on lookup
+
+def main(infile, outfile):
+  '''Write a C header to @outfile listing the code-point ranges from @infile
+     with their HB_Grapheme_* class, sorted and with adjacent ranges merged.'''
+  ranges = sort_and_merge(unicode_file_parse(infile, property_to_harfbuzz))
+
+  print >>outfile, '// Generated from Unicode Grapheme break tables\n'
+  print >>outfile, '#ifndef GRAPHEME_BREAK_PROPERTY_H_'
+  print >>outfile, '#define GRAPHEME_BREAK_PROPERTY_H_\n'
+  print >>outfile, '#include <stdint.h>'
+  print >>outfile, '#include "harfbuzz-external.h"\n'
+  print >>outfile, 'struct grapheme_break_property {'
+  print >>outfile, '  uint32_t range_start;\n  uint32_t range_end;'
+  print >>outfile, '  HB_GraphemeClass klass;'
+  print >>outfile, '};\n'
+  print >>outfile, 'static const struct grapheme_break_property grapheme_break_properties[] = {'
+  for (start, end, value) in ranges:
+    print >>outfile, '  {0x%x, 0x%x, %s},' % (start, end, value)
+  print >>outfile, '};\n'
+  print >>outfile, 'static const unsigned grapheme_break_properties_count = %d;\n' % len(ranges)
+  print >>outfile, '#endif  // GRAPHEME_BREAK_PROPERTY_H_'
+
+if __name__ == '__main__':
+  if len(sys.argv) != 3:
+    # sys.exit(str) prints the text to stderr and exits with status 1.
+    sys.exit('Usage: %s <input .txt> <output .h>' % sys.argv[0])
+  main(file(sys.argv[1], 'r'), file(sys.argv[2], 'w+'))
diff --git a/contrib/tables/scripts-parse.py b/contrib/tables/scripts-parse.py
new file mode 100644
index 0000000..23bac10
--- /dev/null
+++ b/contrib/tables/scripts-parse.py
@@ -0,0 +1,75 @@
+import sys
+from unicode_parse_common import *
+
+# http://www.unicode.org/Public/5.1.0/ucd/Scripts.txt
+
+script_to_harfbuzz = {  # looked up through ScriptDict: unlisted scripts become HB_Script_Common
+  # This is the list of HB_Script_* at the time of writing
+  'Common': 'HB_Script_Common',
+  'Greek': 'HB_Script_Greek',
+  'Cyrillic': 'HB_Script_Cyrillic',
+  'Armenian': 'HB_Script_Armenian',
+  'Hebrew': 'HB_Script_Hebrew',
+  'Arabic': 'HB_Script_Arabic',
+  'Syriac': 'HB_Script_Syriac',
+  'Thaana': 'HB_Script_Thaana',
+  'Devanagari': 'HB_Script_Devanagari',
+  'Bengali': 'HB_Script_Bengali',
+  'Gurmukhi': 'HB_Script_Gurmukhi',
+  'Gujarati': 'HB_Script_Gujarati',
+  'Oriya': 'HB_Script_Oriya',
+  'Tamil': 'HB_Script_Tamil',
+  'Telugu': 'HB_Script_Telugu',
+  'Kannada': 'HB_Script_Kannada',
+  'Malayalam': 'HB_Script_Malayalam',
+  'Sinhala': 'HB_Script_Sinhala',
+  'Thai': 'HB_Script_Thai',
+  'Lao': 'HB_Script_Lao',
+  'Tibetan': 'HB_Script_Tibetan',
+  'Myanmar': 'HB_Script_Myanmar',
+  'Georgian': 'HB_Script_Georgian',
+  'Hangul': 'HB_Script_Hangul',
+  'Ogham': 'HB_Script_Ogham',
+  'Runic': 'HB_Script_Runic',
+  'Khmer': 'HB_Script_Khmer',
+  'Inherited': 'HB_Script_Inherited',
+}
+
+class ScriptDict(object):
+  '''Lookup wrapper around a dict, @base, in which any key that is missing
+     (or mapped to None) looks up as 'HB_Script_Common'.'''
+  def __init__(self, base):
+    self.base = base
+
+  def __getitem__(self, key):
+    found = self.base.get(key)
+    return found if found is not None else 'HB_Script_Common'
+
+def main(infile, outfile):
+  '''Write a C header to @outfile listing the merged code-point ranges from
+     @infile with their HB_Script_* value. Ranges that map to
+     HB_Script_Common, including scripts with no entry of their own, are omitted.'''
+  scripts = ScriptDict(script_to_harfbuzz)
+  merged = sort_and_merge(unicode_file_parse(infile, scripts, 'HB_Script_Common'))
+
+  print >>outfile, '// Generated from Unicode script tables\n'
+  print >>outfile, '#ifndef SCRIPT_PROPERTIES_H_'
+  print >>outfile, '#define SCRIPT_PROPERTIES_H_\n'
+  print >>outfile, '#include <stdint.h>'
+  print >>outfile, '#include "harfbuzz-shaper.h"\n'
+  print >>outfile, 'struct script_property {'
+  print >>outfile, '  uint32_t range_start;\n  uint32_t range_end;'
+  print >>outfile, '  HB_Script script;'
+  print >>outfile, '};\n'
+  print >>outfile, 'static const struct script_property script_properties[] = {'
+  for entry in merged:
+    print >>outfile, '  {0x%x, 0x%x, %s},' % entry
+  print >>outfile, '};\n'
+  print >>outfile, 'static const unsigned script_properties_count = %d;\n' % len(merged)
+  print >>outfile, '#endif  // SCRIPT_PROPERTIES_H_'
+
+if __name__ == '__main__':
+  if len(sys.argv) != 3:
+    # sys.exit(str) prints the text to stderr and exits with status 1.
+    sys.exit('Usage: %s <input .txt> <output .h>' % sys.argv[0])
+  main(file(sys.argv[1], 'r'), file(sys.argv[2], 'w+'))
diff --git a/contrib/tables/unicode_parse_common.py b/contrib/tables/unicode_parse_common.py
new file mode 100644
index 0000000..ac26eca
--- /dev/null
+++ b/contrib/tables/unicode_parse_common.py
@@ -0,0 +1,70 @@
+def lines_get(f):
+  '''Parse a file like object, removing comments and returning a list of
+     the non-blank lines that remain.
+
+     Everything from the first '#' onwards is dropped, then surrounding
+     whitespace (line terminator included) is stripped. Stripping, rather
+     than slicing off one character, keeps the last character of a final
+     line that has no trailing newline and discards lines that hold only
+     whitespace in front of a comment.'''
+  return [x for x in [y.split('#', 1)[0].strip() for y in f.readlines()] if x]
+
+def line_split(line):
+  '''Split a line on semicolons and return the list of fields, each with its
+     leading and trailing whitespace removed.'''
+  fields = line.split(';')
+  return [field.strip() for field in fields]
+
+def codepoints_parse(token):
+  '''Parse a Unicode style code-point token such as '0041' or '0041..005A'.
+     A single code-point yields an integer and a range yields a tuple of
+     (start, end); the digits are read as hexadecimal. ValueError is raised
+     for a token holding more than one '..' separator.'''
+  bounds = token.split('..')
+  if len(bounds) > 2:
+    raise ValueError(token)
+  values = [int(bound, 16) for bound in bounds]
+  if len(values) == 2:
+    return (values[0], values[1])
+  return values[0]
+
+def unicode_file_parse(input, map, default_value = None):
+  '''Parse a file like object, @input, where the first column is a code-point
+     range and the second column is looked up in the given dict, @map.
+
+     Returns a list of (start, end, value) tuples in file order. Lines whose
+     mapped value equals @default_value are skipped. A line that does not
+     have exactly two columns raises ValueError.'''
+  ranges = []
+  for fields in [line_split(x) for x in lines_get(input)]:
+    if len(fields) != 2:
+      raise ValueError(fields)
+    codepoints = codepoints_parse(fields[0])
+    value = map[fields[1]]
+    if value == default_value:
+      continue
+    if type(codepoints) == int:
+      codepoints = (codepoints, codepoints)
+    ranges.append((codepoints[0], codepoints[1], value))
+
+  return ranges
+
+def sort_and_merge(ranges):
+  '''Given a list of (start, end, value), merge elements where the ranges are
+     contiguous and the values are the same.
+
+     @ranges is sorted in place first. An element is folded into the previous
+     output element only when it starts exactly one past that element's end
+     and has an equal value; otherwise it is emitted unchanged.
+
+     The merged list is returned.'''
+  ranges.sort()
+  merged = []
+  for item in ranges:
+    # Grow the last emitted range when this one continues it directly.
+    if merged and merged[-1][1] + 1 == item[0] and merged[-1][2] == item[2]:
+      merged[-1] = (merged[-1][0], item[1], item[2])
+    else:
+      merged.append(item)
+
+  return merged