[poppler] Abiword Poppler Backend

Jeff Muizelaar jeff at infidigm.net
Tue Apr 3 08:51:04 PDT 2007


Included below is Jauco's work from SoC 2006. It doesn't modify any
existing poppler code so I don't see any reason not to merge it. Does
anyone disagree?

-Jeff

diff -urN -x CVS poppler-abiword-no/configure.ac poppler-abiword/configure.ac
--- poppler-abiword-no/configure.ac	2007-04-01 14:30:01.256637032 -0400
+++ poppler-abiword/configure.ac	2007-03-31 11:34:08.000000000 -0400
@@ -251,6 +251,9 @@
 fi
 AM_CONDITIONAL(BUILD_GTK_TEST, test x$enable_gtk_test = xyes)
 
+PKG_CHECK_MODULES(LIBXML, libxml-2.0)
+AC_SUBST(LIBXML_LIBS)
+AC_SUBST(LIBXML_CFLAGS)
 
 AC_ARG_ENABLE(utils,
               AC_HELP_STRING([--disable-utils],
diff -urN -x CVS poppler-abiword-no/poppler/ABWOutputDev.cc poppler-abiword/poppler/ABWOutputDev.cc
--- poppler-abiword-no/poppler/ABWOutputDev.cc	1969-12-31 19:00:00.000000000 -0500
+++ poppler-abiword/poppler/ABWOutputDev.cc	2007-03-31 13:31:20.000000000 -0400
@@ -0,0 +1,1286 @@
+//========================================================================
+//
+// ABWOutputDev.cc
+//
+// Jauco Noordzij
+//
+// Based somewhat on HtmlOutputDev.cc
+//
+//========================================================================
+
+#ifdef __GNUC__
+#pragma implementation
+#endif
+
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <ctype.h>
+#include <math.h>
+#include "goo/GooString.h"
+#include "goo/GooList.h"
+#include "UnicodeMap.h"
+#include "goo/gmem.h"
+#include "Error.h"
+#include "GfxState.h"
+#include "GlobalParams.h"
+#include "ABWOutputDev.h"
+#include "UGooString.h"
+#include "PDFDoc.h"
+
+#include <libxml/parser.h>
+#include <libxml/tree.h>
+#include <libxml/xpath.h>
+#include <libxml/xpathInternals.h>
+#include <libxml/debugXML.h>
+
+
+// Horizontal gap, as a fraction of the transformed font size, above which
+// beginString starts a new word.
+#define minWordBreakSpace 0.1
+
+// Horizontal gap (fraction of the font size) above which beginString starts a new text block.
+#define maxWordSpacing 1.5
+
+// beginString starts a new text block when the vertical distance exceeds the
+// transformed font size divided by this value.
+#define maxLineSpacingDelta 1.5
+
+#define C_maxVCutValue 4 // vertical cut-off; in the code visible here only used by a commented-out check in getBiggestSeperator
+#define C_maxHCutValue 5 // horizontal cut-off; same remark as for C_maxVCutValue
+//------------------------------------------------------------------------
+// ABWOutputDev
+//------------------------------------------------------------------------
+
+ABWOutputDev::ABWOutputDev(xmlDocPtr ext_doc)
+{
+  //Give the caller's document an <abiword> root with a <styles> and a <content> child.
+  doc = ext_doc;
+  N_root = xmlNewNode(NULL, BAD_CAST "abiword");
+  xmlDocSetRootElement(doc, N_root);
+  N_styleset = xmlNewChild(N_root, NULL, BAD_CAST "styles", NULL);
+  N_content = xmlNewChild(N_root, NULL, BAD_CAST "content", NULL);
+  N_page = N_style = N_text = N_Block = N_word = NULL; pdfdoc = NULL;
+  uMap = globalParams->getTextEncoding();
+  Style = maxStyle = 1;
+}
+
+ABWOutputDev::~ABWOutputDev() { //frees neither doc nor any node created by this object
+  xmlCleanupParser(); //NOTE(review): resets libxml's global state for the whole process - confirm no caller uses libxml after deleting this object
+}
+
+void ABWOutputDev::startPage(int pageNum, GfxState *state) {
+  G_pageNum = pageNum;
+  /*Fresh, unattached <page> node. It is the placeholder parent for the text
+  fragments while the pdf page is being read; endPage() restructures it and
+  only then hangs it under N_content.*/
+  N_page = xmlNewNode(NULL, BAD_CAST "page");
+} 
+
+/*Callback: poppler reached the end of a page. The structure detection for the
+page is run from here, one pass after the other, with a cleanup in between.*/
+void ABWOutputDev::endPage() {
+  //make sure all words are closed
+  endTextBlock();
+  cleanUpNode(N_page, true);
+  //(disabled debug dump) xmlAddChild(N_content, N_page);
+  //(disabled debug dump) xmlSaveFormatFileEnc("pre-cut.xml", doc, "UTF-8", 1);
+  //(disabled debug dump) xmlUnlinkNode(N_page);
+  //call the top down cutting mechanism
+  recursiveXYC(N_page);
+  //The passes do not worry about creating empty nodes, which made the code
+  //quite a bit more robust. cleanUpNode makes sure we have a clean tree again.
+  cleanUpNode(N_page, true);
+  //(disabled debug dump) xmlAddChild(N_content, N_page);
+  //(disabled debug dump) xmlSaveFormatFileEnc("raw.xml", doc, "UTF-8", 1);
+  //(disabled debug dump) xmlUnlinkNode(N_page);
+  
+  //Interpret the XY tree and infer text blocks and columns (this replaces N_page by a new tree)
+  interpretXYTree();
+  cleanUpNode(N_page, true);
+  //(disabled debug dump) xmlAddChild(N_content, N_page);
+  //(disabled debug dump) xmlSaveFormatFileEnc("interpreted.xml", doc, "UTF-8", 1);
+  //(disabled debug dump) xmlUnlinkNode(N_page);
+  
+  //I have blocks and columns, this function will turn that into paragraphs and
+  //columns
+  generateParagraphs();
+  cleanUpNode(N_page, true);
+  //(disabled debug dump) xmlAddChild(N_content, N_page);
+  //(disabled debug dump) xmlSaveFormatFileEnc("paragraphs.xml", doc, "UTF-8", 1);
+  
+  xmlAddChild(N_content, N_page); //the finished page becomes part of the document
+  //just for cleanliness
+  N_page = NULL;
+}
+
+void ABWOutputDev::recursiveXYC(xmlNodePtr nodeset) {
+  /*Recursive XY cut. getBiggestSeperator() reports the widest vertical and
+  the widest horizontal whitespace between the children of nodeset (-1 when
+  there is none). splitNodes() then cuts the children in two groups along one
+  of them, and both groups are cut further in the same way.*/
+  float widestV, widestH;
+  float vCutLow, vCutHigh, hCutLow, hCutHigh;
+
+  widestV = getBiggestSeperator(nodeset, VERTICAL, &vCutLow, &vCutHigh);
+  widestH = getBiggestSeperator(nodeset, HORIZONTAL, &hCutLow, &hCutHigh);
+
+  //printf("***\nbetween %f and %f there is a vertical seperation of %f\n",vCutLow,vCutHigh,widestV);
+  //printf("between %f and %f there is a horizontal seperation of %f\n",hCutLow,hCutHigh,widestH);
+
+  if ((widestV == -1) && (widestH == -1)) {
+    //no whitespace in either direction: nothing left to cut
+    return;
+  }
+
+  //the lower border of the chosen whitespace is the value to cut at
+  if ((widestV == -1) && (widestH > -1)) {
+    //only a horizontal cut is possible
+    splitNodes(hCutLow, HORIZONTAL, nodeset, widestH);
+  }
+  else if ((widestV > -1) && (widestH == -1)) {
+    //only a vertical cut is possible
+    splitNodes(vCutLow, VERTICAL, nodeset, widestV);
+  }
+  else if ((widestV > -1) && (widestH > -1)) {
+    //When people read a text they prefer vertical cuts over horizontal
+    //ones. I'm not that sure about the 1.7 value, but it seems to work.
+    if (widestV >= (widestH/1.7)) {
+      splitNodes(vCutLow, VERTICAL, nodeset, widestV);
+    }
+    else {
+      splitNodes(hCutLow, HORIZONTAL, nodeset, widestH);
+    }
+  }
+
+  //splitNodes() moved all children into two groups and appended those to
+  //nodeset, so after a cut they are its first and its second child.
+  recursiveXYC(nodeset->children);
+  recursiveXYC(nodeset->children->next);
+}
+
+void ABWOutputDev::splitNodes(float splitValue, unsigned int direction, xmlNodePtr N_parent, double extravalue){
+  //Distributes every child of N_parent over two new container nodes: children
+  //whose Y1 (HORIZONTAL) or X1 (otherwise) property is larger than splitValue
+  //go into the second container, all others into the first. Both containers
+  //are appended to N_parent and carry extravalue as their "diff" property.
+  xmlNodePtr N_move, N_cur, N_newH, N_newL;
+  const char *propName; //only ever points at string literals
+  const char *nodeName;
+  char buf[64];
+  if (direction == HORIZONTAL) {propName = "Y1"; nodeName = "horizontal";}
+  else { propName = "X1"; nodeName = "vertical";}
+  N_newH = xmlNewNode(NULL, BAD_CAST nodeName);
+  N_newL = xmlNewNode(NULL, BAD_CAST nodeName);
+  //bounded formatting: "%f" of a very large value must not overrun buf
+  snprintf(buf, sizeof(buf), "%f", extravalue);
+  xmlNewProp(N_newH, BAD_CAST "diff", BAD_CAST buf);
+  xmlNewProp(N_newL, BAD_CAST "diff", BAD_CAST buf);
+  N_cur = N_parent->children;
+  while (N_cur){
+    N_move = N_cur->next;
+    //xmlGetProp hands out a copy that has to be released with xmlFree
+    xmlChar *coord = xmlGetProp(N_cur, BAD_CAST propName);
+    bool goesHigh = xmlXPathCastStringToNumber(coord) > splitValue;
+    if (coord) xmlFree(coord);
+    xmlUnlinkNode(N_cur);
+    if (goesHigh) xmlAddChild(N_newH, N_cur);
+    else xmlAddChild(N_newL, N_cur);
+    N_cur = N_move;
+  }
+  xmlAddChild(N_parent, N_newL);
+  xmlAddChild(N_parent, N_newH);
+}
+
+/*
+This function gets the biggest whitespace in a portion of the document.
+It does so by creating an array of begin and end points of non-whitespace
+ex: 132,180,200,279,280,600
+whitespace ^       ^
+The size of the biggest gap is returned and its two borders are stored in
+*C1 and *C2. When there is no gap at all -1 is returned and *C1 / *C2 are
+left untouched. Keeping the array sorted and merged is done by hand below,
+case by case. Better ideas for managing this information are welcomed.
+*/
+float ABWOutputDev::getBiggestSeperator(xmlNodePtr N_set, unsigned int direction, float * C1, float * C2)
+{
+  //borders holds the merged ranges as sorted begin/end pairs
+  float curC1, curC2, retVal;
+  xmlNodePtr N_cur;
+  unsigned int borders_size = 0;
+  unsigned int j;
+  float * borders;
+  borders = new float[borders_size];
+  
+  for (N_cur = N_set->children; N_cur; N_cur = N_cur->next){
+    xmlChar *lowProp, *highProp;
+    if (direction == VERTICAL){
+      lowProp  = xmlGetProp(N_cur,BAD_CAST "X1");
+      highProp = xmlGetProp(N_cur,BAD_CAST "X2");
+    }
+    else {
+      lowProp  = xmlGetProp(N_cur,BAD_CAST "Y1");
+      highProp = xmlGetProp(N_cur,BAD_CAST "Y2");
+    }
+    //xmlGetProp returns copies; convert them and release them again
+    curC1 = xmlXPathCastStringToNumber(lowProp);
+    curC2 = xmlXPathCastStringToNumber(highProp);
+    if (lowProp) xmlFree(lowProp); if (highProp) xmlFree(highProp);
+    j=0;
+    /*
+    we skip to the array range by range. At any time borders[j] is the beginning
+    of the range, borders[j+1] is the end. borders[j+2] is the beginning of the 
+    next one etc.
+    */
+    while (j < borders_size) {
+      if (curC1 < borders[j]) {
+        //printf("  110:curC1 < borders[j] (%f)\n",borders[j]);
+        if (curC2 < borders[j]) {
+          //printf("    112:curC2 < borders[j] (%f)\n",borders[j]);
+          //printf("***insert before %d\n",j);
+          float * borders_new = new float[borders_size+2]; 
+          for (unsigned int n = 0; n < borders_size; n++ ) {
+            if (n <j){
+              borders_new[n] = borders[n];
+            }
+            else {
+              borders_new[n+2] = borders[n];
+            }
+          }
+          delete [] borders;
+          borders = borders_new;
+          borders[j]   = curC1;
+          borders[j+1] = curC2;
+          borders_size = borders_size+2;
+          break;
+        }
+        else {
+          //printf("  130:curC1 >= borders[j] (%f)\n",borders[j]);
+          if (curC2 <= borders[j+1]){
+            //printf("***replace %f with %f\n",borders[j], curC1);
+            //printf("    132:curC2 <= borders[j+1] (%f)\n",borders[j+1]);
+            borders[j] = curC1;
+            break;
+          }
+          else {
+            if (j+2 < borders_size and curC2 >= borders[j+2]){
+              //the merged range has to start at the new, smaller left coordinate
+              borders[j] = curC1;
+              //At this point we have a range whose right coordinate is higher
+              //then the left coordinate of the following range. ie. the new one
+              //overlaps. This code removes all the elements it overlaps, and
+              //inserts the new end-marker
+              unsigned int c = 0;
+              //first, look for how much array elements need to be removed
+              //starting fro the right-hand coordinate of the current range.
+              for (unsigned int n = j+2; n < borders_size; n++) {
+                if (curC2 < borders[n]){
+                  break;
+                }
+                c++;
+              }
+              //printf("end: c = %d\n",c);
+              //Remove the intermediate ones. insert the curC2
+              unsigned int newSize;
+              if (c %2 ==0){
+                newSize = borders_size-c;
+              }
+              else {
+                newSize = borders_size-c-1;
+              }
+              float * borders_new = new float[newSize];
+              for (unsigned int n = 0; n < borders_size; n++ ) {
+                if (n <= j){
+                  //printf("*copying: old: n = %d, new: n = %d, value = %f\n",n,n,borders[n]);
+                  borders_new[n] = borders[n];
+                }
+                else {
+                  if (c %2 ==0){
+                    if (n == j+c+1){
+                     //printf("*new value: new: n = %d, value = %f\n",n-c),curC2);
+                     borders_new[n-c] = curC2;
+                    }
+                    else {
+                      if (n>j+c+1) {
+                        //printf("*copy after: old: n = %d, new: n = %d, value = %f\n",n,n-c,borders[n]);
+                        borders_new[n-c] = borders[n];
+                      }
+                    }
+                  }
+                  else {
+                    if (n>j+c+1) {
+                    //printf("*copy after: old: n = %d, new: n = %d, value = %f\n",n,n-c,borders[n]);
+                      borders_new[n-c-1] = borders[n];
+                    }
+                  }
+                }
+              }
+              delete [] borders;
+              borders = borders_new;
+              borders_size = newSize;
+              break;
+            }
+            else {
+              //printf("    137:curC2 > borders[j+1] (%f)\n",borders[j+1]);
+              //printf("***replace %f with %f and %f with %f\n",borders[j], curC1, borders[j+1], curC2);
+              borders[j] = curC1;
+              borders[j+1] = curC2;
+              break;
+            }
+          }
+        }
+      }
+      else {
+        //printf("  145:curC1 >= borders[j] (%f)\n",borders[j]);
+        if (curC1 <= borders[j+1]) {
+          //printf("    147:curC1 < borders[j+1] (%f)\n",borders[j]);
+          if (curC2 <= borders[j+1]) {
+            //printf("***ignore.\n");
+            //printf("      149:curC2 <= borders[j+1] (%f)\n",borders[j+1]);
+            break;
+          }
+          else {
+            //printf("      153:curC2 > borders[j+1] (%f)\n",borders[j+1]);
+            if (j+2 < borders_size and curC2 >= borders[j+2]){
+              //printf("***replace larger ones.\n");
+              //printf("        155:curC2 >= borders[j+2](%f)\n",borders[j+2]);
+              //At this point we have a range whose right coordinate is higher
+              //then the left coordinate of the following range. ie. the new one
+              //overlaps. This code removes all the elements it overlaps, and
+              //inserts the new end-marker
+              unsigned int c = 0;
+              //first, look for how much array elements need to be removed
+              //starting fro the right-hand coordinate of the current range.
+              for (unsigned int n = j+2; n < borders_size; n++) {
+                if (curC2 < borders[n]){
+                  break;
+                }
+                c++;
+              }
+              //printf("end: c = %d\n",c);
+              //Remove the intermediate ones. insert the curC2
+              unsigned int newSize;
+              if (c %2 ==0){
+                newSize = borders_size-c;
+              }
+              else {
+                newSize = borders_size-c-1;
+              }
+              float * borders_new = new float[newSize];
+              for (unsigned int n = 0; n < borders_size; n++ ) {
+                if (n <= j){
+                  //printf("*copying: old: n = %d, new: n = %d, value = %f\n",n,n,borders[n]);
+                  borders_new[n] = borders[n];
+                }
+                else {
+                  if (c %2 ==0){
+                    if (n == j+c+1){
+                     //printf("*new value: new: n = %d, value = %f\n",n-c),curC2);
+                     borders_new[n-c] = curC2;
+                    }
+                    else {
+                      if (n>j+c+1) {
+                        //printf("*copy after: old: n = %d, new: n = %d, value = %f\n",n,n-c,borders[n]);
+                        borders_new[n-c] = borders[n];
+                      }
+                    }
+                  }
+                  else {
+                    if (n>j+c+1) {
+                    //printf("*copy after: old: n = %d, new: n = %d, value = %f\n",n,n-c,borders[n]);
+                      borders_new[n-c-1] = borders[n];
+                    }
+                  }
+                }
+              }
+              delete [] borders;
+              borders = borders_new;
+              borders_size = newSize;
+              break;
+            }
+            else {
+              //printf("***replace %f with %f\n", borders[j+1], curC2);
+              borders[j+1] = curC2;
+              break;
+            }
+          }
+        }
+        else {
+          //printf("    159:curC1 >= borders[j+1] (%f)\n",borders[j]);
+          j += 2;
+        }
+      }
+    }
+    if (j >= borders_size){
+      //printf("    165:appending curC1 and curC2 to borders\n");
+      float * borders_new = new float[borders_size+2]; 
+      for (unsigned int n = 0; n < borders_size; n++ ) {
+        borders_new[n] = borders[n];
+      }
+      delete [] borders;
+      borders = borders_new;
+      borders[borders_size]   = curC1;
+      borders[borders_size+1] = curC2;
+      borders_size = borders_size+2;
+    }
+  }
+  retVal = -1;
+  //printf("%f, ",borders[0]);
+  for (unsigned int i = 2; i<borders_size; i+=2){
+    //printf("%f, %f, ",borders[i-1],borders[i]);
+    if (((borders[i]-borders[i-1]) - retVal) > 0.5){
+      retVal = borders[i]-borders[i-1];
+      *C1 = borders[i-1];
+      *C2 = borders[i];
+    }
+    //printf("between %f and %f there is a seperation of %f\n",borders[i-1],borders[i],borders[i]-borders[i-1]);
+  }
+  //printf("\n");
+  delete [] borders;
+  //Arbitrary cut-of values
+/*  if (
+      (direction == HORIZONTAL and retVal < C_maxHCutValue) or 
+      (direction == VERTICAL and retVal < C_maxVCutValue)
+     ) 
+  {
+    retVal = -1;
+  }*/
+  return retVal;
+}
+
+void ABWOutputDev::updateFont(GfxState *state) {
+  //Makes Style the id of the style that matches the current font of state
+  //(name, size, bold, italic). A new <s> node is appended to N_styleset when
+  //no existing style matches. beginWord() stores Style on every new word.
+  char buf[160];
+  xmlNodePtr N_cur;
+  GfxFont *font;
+  bool found = false;
+  bool isBold, isItalic;
+  GooString *ftName;
+  char *fnEnd, *fnName;
+  const char *origName, *hit;
+  int origLen, fnStart, ftSize;
+  xmlChar *pBold, *pItalic, *pFont, *pSize, *pId;
+  font = state->getFont();
+  //the first time this function is called there is no font.
+  //Fixme: find out if that isn't a bug. NOTE(review): getOrigName() is assumed to be NULL for unnamed fonts - confirm
+  if (font and font->getOrigName()){
+    origName = font->getOrigName()->getCString();
+    origLen = font->getOrigName()->getLength();
+    //The name counts as bold/italic when its first "Bold"/"Italic" is also its
+    //tail. strstr returns NULL when the word is missing: test before subtracting.
+    hit = strstr(origName, "Bold");
+    isBold = (font->isBold() or font->getWeight() >6 or (hit and (hit-origName == origLen-4)));
+    hit = strstr(origName, "Italic");
+    isItalic = (font->isItalic() or (hit and (hit-origName == origLen-6)));
+    ftSize = int(state->getTransformedFontSize())-1;
+    //Work on a private copy of the name: drop everything up to and including the first '+' ...
+    ftName = new GooString(font->getOrigName());
+    fnStart = strcspn(ftName->getCString(), "+");
+    if (fnStart < ftName->getLength())
+      ftName->del(0,fnStart+1);
+    //... then cut at the last ',' (44) or, failing that, at the last '-' (45)
+    fnEnd = strrchr(ftName->getCString(), 44);
+    if (fnEnd == 0)
+      fnEnd = strrchr(ftName->getCString(), 45);
+    if (fnEnd != 0)
+      ftName->del(fnEnd-ftName->getCString(),ftName->getLength()-1);
+    fnName = ftName->getCString();
+    for (N_cur = N_styleset->children; N_cur; N_cur = N_cur ->next){
+      //xmlGetProp returns copies: fetch each property once, free it below
+      pBold   = xmlGetProp(N_cur,BAD_CAST "bold");
+      pItalic = xmlGetProp(N_cur,BAD_CAST "italic");
+      pFont   = xmlGetProp(N_cur,BAD_CAST "font");
+      pSize   = xmlGetProp(N_cur,BAD_CAST "size");
+      pId     = xmlGetProp(N_cur,BAD_CAST "id");
+      if (
+       isBold == (xmlStrcasecmp(pBold,BAD_CAST "bold;") == 0)
+       and
+       isItalic == (xmlStrcasecmp(pItalic,BAD_CAST "italic") == 0)
+       and
+       xmlStrcasecmp(pFont,BAD_CAST fnName) == 0
+       and
+       xmlXPathCastStringToNumber(pSize) == ftSize
+      ) {
+        found = true;
+        Style = int(xmlXPathCastStringToNumber(pId));
+      }
+      if (pBold) xmlFree(pBold); if (pItalic) xmlFree(pItalic); if (pFont) xmlFree(pFont);
+      if (pSize) xmlFree(pSize); if (pId) xmlFree(pId);
+    }
+    if (!found){
+      N_cur = xmlNewChild(N_styleset, NULL, BAD_CAST "s", NULL);
+      xmlSetProp(N_cur, BAD_CAST "type", BAD_CAST "P");
+      //Style has to be the id written on the node, exactly as in the lookup above
+      Style = maxStyle++;
+      sprintf(buf, "%d", Style);
+      xmlSetProp(N_cur, BAD_CAST "name", BAD_CAST buf);
+      xmlSetProp(N_cur, BAD_CAST "id", BAD_CAST buf);
+      sprintf(buf, "%d", ftSize); xmlSetProp(N_cur, BAD_CAST "size", BAD_CAST buf);
+      isBold   ? xmlSetProp(N_cur, BAD_CAST "bold", BAD_CAST "bold;")  : xmlSetProp(N_cur, BAD_CAST "bold", BAD_CAST "normal;");
+      isItalic ? xmlSetProp(N_cur, BAD_CAST "italic", BAD_CAST "italic"): xmlSetProp(N_cur, BAD_CAST "italic", BAD_CAST "normal");
+      xmlSetProp(N_cur, BAD_CAST "font", BAD_CAST fnName);
+    }
+    delete ftName; //fnName pointed into it; nothing uses the name past this point
+  }
+}
+
+void ABWOutputDev::drawChar(GfxState *state, double x, double y,
+			double dx, double dy,
+			double originX, double originY,
+			CharCode code, int nBytes, Unicode *u, int uLen)
+{
+  //Callback for a single character. A lone space (char code 0x20) closes the
+  //current word and opens a new one behind it; any other character moves the
+  //end of the current word by (dx,dy) and has its text appended to N_word.
+  //I wouldn't know what size this should safely be. I guess 64 bytes should be
+  //enough for any unicode character
+  char buf[64];
+  int charLen;
+  //state->textTransformDelta(dx * state->getHorizScaling(), dy, &dx, &dy);
+  //state->transformDelta(dx, dy, &dx, &dy);
+  if (uLen == 1 && code == 0x20) {
+    //If we break a text sequence on space, then the X1 should be increased
+    //but the Y1 and Y2 should remain the same.
+    beginWord(state,X2+dx,Y2);
+  }
+  else {
+    X2    += dx;
+    Y2    += dy;
+    //u holds uLen unicode values, not always exactly one (presumably several for
+    //a ligature). Append all of them; with uLen == 0 u[0] must not be read.
+    for (int i = 0; i < uLen; i++) {
+      charLen = uMap->mapUnicode(u[i],buf,sizeof(buf));
+      xmlNodeAddContentLen(N_word, BAD_CAST buf, charLen);
+    }
+  }
+}
+
+void ABWOutputDev::beginString(GfxState *state, GooString *s) {
+  //Decides, from the transformed position at which the string starts, whether
+  //it continues the current word or opens a new word or a new text block.
+  double posX, posY;
+  state->transform(state->getCurX(), state->getCurY(), &posX, &posY);
+  if (!N_word) {
+    //This is the first word. Clear all values and open a text block, which
+    //in turn opens the word.
+    X2 = posX;
+    Y2 = posY;
+    horDist = 0;
+    verDist = 0;
+    height  = 0;
+    beginTextBlock(state,posX,posY);
+    return;
+  }
+  //distance between the start of this string and the end of the text so far
+  verDist = posY-Y2;
+  horDist = posX-X2;
+  const double fontSize = state->getTransformedFontSize();
+  const bool lineJump = fabs(verDist) > (fontSize/maxLineSpacingDelta);
+  if (horDist > (fontSize*maxWordSpacing) || lineJump) {
+    beginTextBlock(state,posX,posY);
+  }
+  else if ((horDist > (fontSize*minWordBreakSpace)) || lineJump) {
+    beginWord(state,posX,posY);
+  }
+}
+
+void ABWOutputDev::endString(GfxState *state) {
+  //No work is done when a string ends.
+}
+
+void ABWOutputDev::beginWord(GfxState *state, double x, double y){
+  char buf[64];
+  //Closes the word in progress and opens a new <word> under N_Block at (x,y).
+  endWord();
+  X1 = x;
+  Y2 = y;
+  //distances to what was written before; Y1 is only updated further down
+  horDist = X1-X2;
+  verDist = Y1-Y2;
+  //the new word starts with zero width; Y1 is Y2 minus ascent * font size
+  X2 = X1;
+  height = state->getFont()->getAscent() * state->getTransformedFontSize();
+  Y1 = Y2-height;
+  //snprintf: a very large coordinate is truncated instead of overrunning buf
+  N_word = xmlNewChild(N_Block, NULL, BAD_CAST "word", NULL);
+  snprintf(buf, sizeof(buf), "%f", X1); xmlNewProp(N_word, BAD_CAST "X1", BAD_CAST buf);
+  snprintf(buf, sizeof(buf), "%f", Y1); xmlNewProp(N_word, BAD_CAST "Y1", BAD_CAST buf);
+  snprintf(buf, sizeof(buf), "%d", Style); xmlNewProp(N_word, BAD_CAST "style", BAD_CAST buf);
+}
+
+void ABWOutputDev::endWord(){ //Finishes the open word, if any: stores X2, Y2, width and height on it
+  char buf[64]; //snprintf below truncates a very large value instead of overrunning this buffer
+  if (N_word) {
+    snprintf(buf, sizeof(buf), "%f", X2);    xmlNewProp(N_word, BAD_CAST "X2", BAD_CAST buf);
+    snprintf(buf, sizeof(buf), "%f", Y2);    xmlNewProp(N_word, BAD_CAST "Y2", BAD_CAST buf);
+    snprintf(buf, sizeof(buf), "%f", X2-X1); xmlNewProp(N_word, BAD_CAST "width", BAD_CAST buf);
+    snprintf(buf, sizeof(buf), "%f", Y2-Y1); xmlNewProp(N_word, BAD_CAST "height", BAD_CAST buf);
+    N_word = NULL;
+  }
+}
+
+void ABWOutputDev::beginTextBlock(GfxState *state, double x, double y){
+  //Close whatever block is open, hang a new <Textblock> under N_page and open its first word at (x,y).
+  endTextBlock();
+  N_Block = xmlNewChild(N_page, NULL, BAD_CAST "Textblock", NULL);
+  beginWord(state,x,y);
+}
+
+void ABWOutputDev::endTextBlock(){
+  //Nothing to close when no text block is open.
+  if (!N_Block) return;
+  //Close the word in progress first, then forget the block.
+  endWord();
+  N_Block = NULL;
+}
+/*
+Retrieves coherent text blocks from the chunk tree: starts a new page tree and lets ATP_recursive fill it from the cut tree.*/
+void ABWOutputDev::interpretXYTree(){
+  xmlNodePtr N_oldPage = N_page;
+  N_page = xmlNewNode(NULL, BAD_CAST "page");
+  N_column = N_page;
+  N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
+  ATP_recursive(N_oldPage);
+  //ATP_recursive unlinked every node it kept; what is left of the old tree is referenced nowhere else
+  xmlFreeNode(N_oldPage);
+}
+
+void ABWOutputDev::ATP_recursive(xmlNodePtr N_parent){
+  xmlNodePtr N_first, N_second, N_line, N_tempCol, N_tempColset;
+  N_first  = N_parent->children;
+  N_second = N_first ? N_first->next : NULL; //N_parent has no children for e.g. a page without text
+  char buf[20];
+/*Moves the nodes found below N_parent into the new page tree (N_Block, N_column, N_colset).
+  Possibilities: 
+  there is one child node
+    Because we cleaned up before the only case where we allow one childnode is 
+    within Textblocks and textBlocks within 'vertical' nodes.
+      basically one text node means: add it to the current block.
+  There are two childnodes
+    This can be two verticals, two horizontals or one horizontal and a text node.
+    verticals:
+      If the first is vertical, the second is as well.
+      verticals mean: create a new Block, add a column per vertical make the
+      vertical the block and recurse inside.
+      then make the second vertical the block and recurse inside
+      then finish the block (ie. create a new one)
+    horizontal and or Textblocks
+        if first is textnode
+          add first to block
+          if second is textnode
+            add to block
+          else
+            call again
+        else
+          begin new block
+            call again
+          begin new block
+          if second is text node
+            add to block
+          else
+            call again
+  there are more then two child nodes (or none at all)
+    this can be a number of Textblocks and horizontals
+    add the textNodes to the current Block
+    if a horizontal is encountered enter it and generate a new block afterwards
+  */
+  //fprintf(stderr,"**********************************************************************\n");
+  //xmlSaveFormatFileEnc("-", doc, "UTF-8", 1);
+  switch (xmlLsCountNode(N_parent)) {
+  case 1:
+    //fprintf(stderr,"case 1\n");
+    N_line = xmlNewChild(N_Block, NULL, BAD_CAST "line", NULL);
+    xmlUnlinkNode(N_first);
+    xmlAddChild(N_line, N_first);
+    break;
+  case 2:
+    //fprintf(stderr,"case 2\n");
+    if (xmlStrcasecmp(N_first->name,BAD_CAST "vertical") == 0){
+      //store the column for the moment
+      N_tempCol = N_column;
+      /*If we have three columns they will turn up in the tree as:
+      <vertical>
+        <vertical/>
+        <vertical/>
+      </vertical>
+      <vertical/>
+      */
+      //if the parent is a vertical as well, we can skip the colset generation 
+      //thing here we can also remove the just added column and block, because 
+      //these are going to replace them
+      if (xmlStrcasecmp(N_parent->name,BAD_CAST "vertical") != 0){
+        //fprintf(stderr,"first time column\n");
+        N_tempColset = N_colset;
+        N_colset = xmlNewChild(N_column, NULL, BAD_CAST "colset", NULL);
+        N_column = xmlNewChild(N_colset, NULL, BAD_CAST "column", NULL);
+        N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
+      }
+      else {
+        //fprintf(stderr,"second time column\n");
+        xmlUnlinkNode(N_column);
+        N_column = xmlNewChild(N_colset, NULL, BAD_CAST "column", NULL);
+        N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
+      }
+      //fprintf(stderr,"Building first column...\n");
+      ATP_recursive(N_first);
+      N_column = xmlNewChild(N_colset, NULL, BAD_CAST "column", NULL);
+      N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
+      //fprintf(stderr,"Building second column...\n");
+      ATP_recursive(N_second);
+      //make sure we end the column by continuing in the master column and 
+      //setting the block and line to it
+      N_column = N_tempCol;
+      if (xmlStrcasecmp(N_parent->name,BAD_CAST "vertical") != 0){
+        N_colset = N_tempColset;
+      }
+    }
+    else {
+      if (xmlStrcasecmp(N_first->name,BAD_CAST "Textblock") == 0) {
+        //fprintf(stderr,"add first as textblock\n");
+        N_line = xmlNewChild(N_Block, NULL, BAD_CAST "line", NULL);
+        xmlUnlinkNode(N_first);
+        xmlAddChild(N_line, N_first);
+        if (xmlStrcasecmp(N_second->name,BAD_CAST "Textblock") == 0) {
+          //fprintf(stderr,"add second as textblock\n");
+          //FIXME: this is not neat. We should ignore the cut ignoring when there are only two elements above
+          //line aggregation doesn't work anyway atm.
+          xmlUnlinkNode(N_second);
+          xmlAddChild(N_line, N_second);
+          //We have two textChunks that are going to be added to the line.
+          //the following statements make the line wrap around both textblocks
+          //if the firstX1 is smaller then the second X1 use the first, else use the second etc.
+        }
+        else {
+          //fprintf(stderr,"recursing into second\n");
+          ATP_recursive(N_second);
+        }
+      }
+      else {
+        N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
+        //fprintf(stderr,"recursing into first\n");
+        ATP_recursive(N_first);
+        N_Block = xmlNewChild(N_column, NULL, BAD_CAST "chunk", NULL);
+        if (xmlStrcasecmp(N_second->name,BAD_CAST "Textblock") == 0) {
+          //fprintf(stderr,"add second as textblock\n");
+          N_line = xmlNewChild(N_Block, NULL, BAD_CAST "line", NULL);
+          xmlUnlinkNode(N_second);
+          xmlAddChild(N_line, N_second);
+        }
+        else {
+          //fprintf(stderr,"recursing into second\n");
+          ATP_recursive(N_second);
+        }
+      }
+    }
+    break;
+  default:
+    double tX1=0, tX2=0, tY1=0, tY2=0;
+    //fprintf(stderr,"case default\n");
+    N_line = xmlNewChild(N_Block, NULL, BAD_CAST "line", NULL);
+    while (N_first){
+      //xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "X1")) < tX1 ? tX1 = xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "X1")) : tX1 = tX1;
+      //xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "X2")) > tX2 ? tX2 = xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "X2")) : tX2 = tX2;
+      //xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "Y1")) < tY1 ? tY1 = xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "Y1")) : tY1 = tY1;
+      //xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "Y2")) > tY2 ? tY2 = xmlXPathCastStringToNumber(xmlGetProp(N_first,BAD_CAST "Y2")) : tY1 = tY2;
+      N_second = N_first->next;
+      if (xmlStrcasecmp(N_first->name,BAD_CAST "Textblock") == 0){
+        xmlUnlinkNode(N_first);
+        xmlAddChild(N_line, N_first);
+      }
+      else { //fprintf(stderr,"This shouldn't happen! (line 700)\n");
+      }
+      N_first = N_second;
+    }
+    break;
+  }
+}
+
+/*The cleanup function. It started out as a simple function to remove empty nodes
+so that new child nodes could be added freely without causing seg-faults. It now
+also normalises the tree below N_parent and, when aggregateInfo is set, copies the
+bounding box, width/height and most frequent style of the children onto N_parent.*/
+void ABWOutputDev::cleanUpNode(xmlNodePtr N_parent, bool aggregateInfo){
+  //reads a numeric attribute and releases the copy that xmlGetProp allocates
+  struct Attr {
+    static double num(xmlNodePtr node, const char *name) {
+      xmlChar *raw = xmlGetProp(node, BAD_CAST name);
+      double value = xmlXPathCastStringToNumber(raw);
+      if (raw) xmlFree(raw);
+      return value;
+    }
+  };
+  double tX1=-1, tX2=-1, tY1=-1, tY2=-1;
+  double cX1, cX2, cY1, cY2, stylePos;
+  xmlNodePtr N_cur, N_next;
+  N_cur = N_parent->children;
+  //roomier than the former 20 bytes; snprintf below never writes past the end
+  char buf[64];
+  int prevStyle = -1;
+  xmlChar *val;
+  int styleLength = xmlLsCountNode(N_styleset)+1;
+  int styles[styleLength];
+  //slot 0 is cleared too: it is incremented below whenever a style reads as 0
+  for (int i=0; i< styleLength; i++) { styles[i] = 0;}
+  /*
+  ignore two horizontal nodes with textBlocks right underneath them. They
+  signal the end of a chunk, and the horizontal separation needs to be
+  preserved, because it means they are different lines. The second horizontal
+  therefore needs to be kept.
+  */
+  if ((xmlLsCountNode(N_parent) == 2)
+      and
+     xmlStrcasecmp(N_parent->name,BAD_CAST "horizontal") == 0
+      and
+     N_cur
+      and
+     N_cur->next
+      and
+     xmlStrcasecmp(N_cur->name,BAD_CAST "horizontal") == 0 and xmlStrcasecmp(N_cur->next->name,BAD_CAST "horizontal") == 0
+      and
+     xmlLsCountNode(N_cur) == 1 and xmlLsCountNode(N_cur->next) == 1
+      and
+     xmlStrcasecmp(N_cur->children->name,BAD_CAST "Textblock") == 0 and xmlStrcasecmp(N_cur->next->children->name,BAD_CAST "Textblock") == 0
+     ) {
+    xmlAddPrevSibling(N_cur->next,N_cur->children);
+    xmlUnlinkNode(N_cur);
+  }
+  /*
+  A further case used to sit here: it removed a pair of columns when one of the
+  parts was actually a single letter. The author liked the columns better, so
+  that code was disabled; it has been left out of this version.
+  */
+  else {
+    while (N_cur){
+      //fetch the sibling first: the recursive call may detach N_cur
+      N_next = N_cur->next;
+      cleanUpNode(N_cur, aggregateInfo);
+      //children that are empty after cleaning are detached, except <cbr> and <s>
+      if (xmlLsCountNode(N_cur) == 0 and (xmlStrcasecmp(N_cur->name,BAD_CAST "cbr") != 0) and (xmlStrcasecmp(N_cur->name,BAD_CAST "s") != 0))
+        xmlUnlinkNode(N_cur);
+      N_cur = N_next;
+    }
+  }
+  //If a container element has only one child, it can be removed except for vertical
+  //cuts with only one textElement; this mainly removes the crumbs left by the loop above
+  if ((xmlLsCountNode(N_parent) == 1) and ((xmlStrcasecmp(N_parent->name,BAD_CAST "horizontal") == 0) or ((xmlStrcasecmp(N_parent->name,BAD_CAST "vertical") == 0) and (xmlStrcasecmp(N_parent->children->name,BAD_CAST "Textblock") != 0)))){
+    N_cur = N_parent->children;
+    xmlAddPrevSibling(N_parent,N_cur);
+    xmlUnlinkNode(N_parent);
+  }
+  //We cannot remove the page element so if it has only one childnode, we remove that childnode instead
+  if ((xmlStrcasecmp(N_parent->name,BAD_CAST "page") == 0) and (xmlLsCountNode(N_parent) == 1)) {
+    N_cur = N_parent->children->children;
+    while (N_cur){
+      N_next = N_cur->next;
+      xmlUnlinkNode(N_cur);
+      xmlAddChild(N_parent, N_cur);
+      N_cur = N_next;
+    }
+    xmlUnlinkNode(N_parent->children);
+  }
+  //Ok, so by this time the N_parent and his children are guaranteed to be clean;
+  //this loop gets information from the children and propagates it up the tree.
+  if (aggregateInfo and xmlStrcasecmp(N_parent->name,BAD_CAST "word") != 0) {
+    for (N_cur = N_parent->children; N_cur; N_cur = N_cur->next){
+      //count the child's style; a value outside the table (or one that is not
+      //a number) is skipped rather than used as an array index
+      stylePos = Attr::num(N_cur, "style");
+      if (stylePos >= 0 and stylePos < styleLength) styles[int(stylePos)]++;
+      //widen the bounding box; -1 marks a minimum that has not been set yet
+      cX1 = Attr::num(N_cur, "X1");
+      cX2 = Attr::num(N_cur, "X2");
+      cY1 = Attr::num(N_cur, "Y1");
+      cY2 = Attr::num(N_cur, "Y2");
+      if (cX1 < tX1 or tX1 == -1) tX1 = cX1;
+      if (cX2 > tX2) tX2 = cX2;
+      if (cY1 < tY1 or tY1 == -1) tY1 = cY1;
+      if (cY2 > tY2) tY2 = cY2;
+    }
+    snprintf(buf, sizeof(buf), "%f", tX1);     xmlSetProp(N_parent, BAD_CAST "X1", BAD_CAST buf);
+    snprintf(buf, sizeof(buf), "%f", tX2);     xmlSetProp(N_parent, BAD_CAST "X2", BAD_CAST buf);
+    snprintf(buf, sizeof(buf), "%f", tY1);     xmlSetProp(N_parent, BAD_CAST "Y1", BAD_CAST buf);
+    snprintf(buf, sizeof(buf), "%f", tY2);     xmlSetProp(N_parent, BAD_CAST "Y2", BAD_CAST buf);
+    snprintf(buf, sizeof(buf), "%f", tX2-tX1); xmlSetProp(N_parent, BAD_CAST "width", BAD_CAST buf);
+    snprintf(buf, sizeof(buf), "%f", tY2-tY1); xmlSetProp(N_parent, BAD_CAST "height", BAD_CAST buf);
+    //slot 0 is forced to -1 so that any real style beats it
+    prevStyle = 0;
+    styles[0] = -1;
+    for (int i=1; i< styleLength; i++) { if (styles[i] > styles[prevStyle]) prevStyle = i; }
+    if (prevStyle > 0){
+      snprintf(buf, sizeof(buf), "%d", prevStyle);     xmlSetProp(N_parent, BAD_CAST "style", BAD_CAST buf);
+    }
+  }
+  //a node whose first child is a <line> carrying an alignment takes over that alignment
+  if (N_parent->children and xmlStrcasecmp(N_parent->children->name,BAD_CAST "line") == 0){
+    val = xmlGetProp(N_parent->children,BAD_CAST "alignment");
+    if (val != NULL){
+      xmlSetProp(N_parent, BAD_CAST "alignment", val);
+      xmlFree(val);
+    }
+  }
+}
+
+//Splits the chunks of the current page into paragraphs. addAlignment() first
+//tags every chunk and line with an alignment code (1=left, 2=right,
+//3=centered, 4=justified). The tree below N_page is then walked and the lines
+//of each chunk are moved into new chunk nodes inserted in front of it; a new
+//chunk is started wherever a paragraph break is detected. The emptied chunk
+//itself stays in the tree.
+//Alignment codes other than 1-4 leave the lines of the chunk where they are.
+//ASSUMPTION: my block seperation thing is good enough so I don't need to
+//worry about two alignments in one paragraph
+void ABWOutputDev::generateParagraphs() {
+  //reads a numeric attribute and releases the copy that xmlGetProp allocates
+  struct Attr {
+    static double num(xmlNodePtr node, const char *name) {
+      xmlChar *raw = xmlGetProp(node, BAD_CAST name);
+      double value = xmlXPathCastStringToNumber(raw);
+      if (raw) xmlFree(raw);
+      return value;
+    }
+  };
+
+  xmlNodePtr N_cur, N_parent, N_p, N_line, N_next, N_firstWord;
+  int lvl, alignment;
+  double slack;
+  //alignment code as text, written back onto every line that is moved
+  char tag[16];
+
+  //reference area handed to addAlignment: the whole page
+  X1 = 0;
+  X2 = pdfdoc->getPageCropWidth(G_pageNum);
+  Y1 = 0;
+  Y2 = pdfdoc->getPageCropHeight(G_pageNum);
+  addAlignment(N_page);
+
+  //then it's a switch per alignment; lvl is the depth below N_page
+  N_cur = N_page->children;
+  N_parent = N_page;
+  lvl = 1;
+  while (N_cur) {
+    if (xmlStrcasecmp(N_cur->name,BAD_CAST "chunk") == 0){
+      //the lines are collected in N_p, a fresh chunk placed in front of N_cur
+      N_p = xmlNewNode(NULL, BAD_CAST "chunk");
+      xmlAddPrevSibling(N_cur,N_p);
+      alignment = int(Attr::num(N_cur, "alignment"));
+      switch (alignment){
+      case 1: //left
+      case 2: //right
+      case 3: //centered
+         //These three shared identical code apart from the tag written on
+         //each line. A new paragraph is started after a line when the width
+         //that line leaves unused in the chunk exceeds the width of the first
+         //word of the following line.
+         snprintf(tag, sizeof(tag), "%d", alignment);
+         N_line = N_cur->children;
+         while (N_line){
+           N_next = N_line->next;
+           xmlUnlinkNode(N_line);
+           xmlAddChild(N_p,N_line);
+           xmlSetProp(N_line, BAD_CAST "alignment", BAD_CAST tag);
+           if (N_next and xmlStrcasecmp(N_next->name,BAD_CAST "line") == 0){
+             //a following line without children used to be dereferenced here;
+             //it is now handled like a first word without a width attribute
+             N_firstWord = N_next->children ? N_next->children->children : NULL;
+             slack = Attr::num(N_cur, "width") - Attr::num(N_line, "width");
+             if (Attr::num(N_firstWord, "width") < slack){
+               N_p = xmlNewNode(NULL, BAD_CAST "chunk");
+               xmlAddPrevSibling(N_cur,N_p);
+             }
+           }
+           N_line = N_next;
+         }
+         break;
+      case 4: //justified
+         //we break on all alignment=1 lines. A line with alignment=1 that is the first of a block will
+         //also initiate a paragraph break before.
+         N_line = N_cur->children;
+         if (Attr::num(N_line, "alignment") == 1){
+           N_p = xmlNewNode(NULL, BAD_CAST "chunk");
+           xmlAddPrevSibling(N_cur,N_p);
+         }
+         while (N_line){
+           N_next = N_line->next;
+           xmlUnlinkNode(N_line);
+           xmlAddChild(N_p,N_line);
+           //the line's own code must be tested before it is overwritten below
+           if (Attr::num(N_line, "alignment") == 1){
+             N_p = xmlNewNode(NULL, BAD_CAST "chunk");
+             xmlAddPrevSibling(N_cur,N_p);
+           }
+           xmlSetProp(N_line, BAD_CAST "alignment", BAD_CAST "4");
+           N_line = N_next;
+         }
+         break;
+      }
+    }
+    else if ((xmlStrcasecmp(N_cur->name,BAD_CAST "colset") == 0 or xmlStrcasecmp(N_cur->name,BAD_CAST "column") == 0)
+             and N_cur->children){
+      //Descend into the container. An empty container is not entered any more:
+      //doing so left N_cur NULL, which ended the whole walk early; it is now
+      //simply stepped over by the sibling logic below.
+      N_parent = N_cur;
+      N_cur = N_cur->children;
+      lvl++;
+      //as before, a new (initially empty) chunk is put in front of the first child
+      N_p = xmlNewNode(NULL, BAD_CAST "chunk");
+      xmlAddPrevSibling(N_cur,N_p);
+      continue;
+    }
+    //move on: next sibling if there is one, otherwise climb until an ancestor
+    //has a next sibling; dropping to lvl 0 means everything has been visited
+    if (N_cur->next)
+      N_cur = N_cur->next;
+    else while (lvl > 0){
+      N_cur = N_parent;
+      N_parent = N_cur->parent;
+      lvl--;
+      if (N_cur->next){
+        N_cur = N_cur->next;
+        break;
+      }
+    }
+    if (lvl==0)
+      N_cur = NULL;
+  }
+}
+
+//function that adds an 'alignment=' property to the <chunk>s below N_parent
+//(1=left, 2=right, 3=centered, 4=justified), recursing into colsets and
+//columns. A chunk with a single line is compared with the enclosing area held
+//in the members X1/X2 (set by the caller, narrowed here for each column). In a
+//chunk with several lines every line is compared with the chunk's own extent
+//and the chunk gets the alignment that dominates among its lines.
+//NOTE(review): the attributes are added with xmlNewProp, which does not look
+//for an existing 'alignment' attribute - confirm this runs only once per
+//tree, or switch to xmlSetProp.
+void ABWOutputDev::addAlignment(xmlNodePtr N_parent) {
+  //reads a numeric attribute and releases the copy that xmlGetProp allocates
+  struct Attr {
+    static double num(xmlNodePtr node, const char *name) {
+      xmlChar *raw = xmlGetProp(node, BAD_CAST name);
+      double value = xmlXPathCastStringToNumber(raw);
+      if (raw) xmlFree(raw);
+      return value;
+    }
+  };
+  xmlNodePtr N_chunk, N_line;
+  double tX1, tX2, chunkX1, chunkX2, lineX1, lineX2, skew;
+  bool leftMatch, rightMatch, centerMatch;
+  int leftCnt = 0, rightCnt = 0, cntrCnt = 0, justCnt = 0;
+  const char *tag;
+  for (N_chunk = N_parent->children; N_chunk; N_chunk = N_chunk->next) {
+    if (xmlStrcasecmp(N_chunk->name,BAD_CAST "chunk") == 0){
+      //The chunk's extent stays in locals. It used to be written into the
+      //members X1/X2, which made the one-line test below compare the chunk
+      //with itself (always 'centered') and lost the enclosing area for the
+      //chunks that followed.
+      chunkX1 = Attr::num(N_chunk, "X1");
+      chunkX2 = Attr::num(N_chunk, "X2");
+      //if the chunk contains only one line, we don't need to loop through it.
+      if (xmlLsCountNode(N_chunk) == 1){
+        // a one line chunk, is either centered or left or right-aligned:
+        // skew = left margin - right margin inside the enclosing area
+        skew = (chunkX1-X1)-(X2-chunkX2);
+        if (skew > 1) tag = "2";        //more room on the left: right
+        else if (skew < -1) tag = "1";  //more room on the right: left
+        else tag = "3";                 //balanced: center
+        xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST tag);
+        xmlNewProp(N_chunk->children, BAD_CAST "alignment", BAD_CAST tag);
+      }
+      else {
+        //tally of the line alignments found inside this chunk
+        leftCnt = 0;
+        rightCnt = 0;
+        cntrCnt = 0;
+        justCnt = 0;
+        for (N_line = N_chunk->children; N_line; N_line = N_line->next) {
+          /*
+          each line can be just as wide as the chunk (justified), it can be
+          smaller with equal room on both sides (centered), smaller and moved
+          to the right, or smaller and moved to the left. Edges count as
+          matching when they are less than 2 units apart.
+          */
+          lineX1 = Attr::num(N_line, "X1");
+          lineX2 = Attr::num(N_line, "X2");
+          leftMatch = fabs(lineX1-chunkX1) < 2;
+          rightMatch = fabs(chunkX2-lineX2) < 2;
+          centerMatch = fabs((lineX1-chunkX1)-(chunkX2-lineX2)) < 2;
+          if (leftMatch and rightMatch) {
+            xmlNewProp(N_line, BAD_CAST "alignment", BAD_CAST "4");
+            justCnt++;
+          }
+          else if (centerMatch) {
+            xmlNewProp(N_line, BAD_CAST "alignment", BAD_CAST "3");
+            cntrCnt++;
+          }
+          else if (rightMatch) {
+            xmlNewProp(N_line, BAD_CAST "alignment", BAD_CAST "2");
+            rightCnt++;
+          }
+          else {
+            xmlNewProp(N_line, BAD_CAST "alignment", BAD_CAST "1");
+            leftCnt++;
+          }
+        }
+        //there is almost always one justified line in a centered text
+        //and most justified blocks have at least one left aligned line
+        //The tests run in the order left, justified, centered, right; left
+        //has to reach justCnt+1 to win, and centered may trail justified by
+        //one line. Right is what remains when none of the others qualifies.
+        if ((leftCnt-1 >= justCnt) and (leftCnt >= rightCnt) and (leftCnt >= cntrCnt))
+          xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "1");
+        else if ((justCnt >= leftCnt-1) and (justCnt >= rightCnt) and (justCnt >= cntrCnt))
+          xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "4");
+        else if ((cntrCnt >= justCnt-1) and (cntrCnt >= rightCnt) and (cntrCnt >= leftCnt))
+          xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "3");
+        else
+          xmlNewProp(N_chunk, BAD_CAST "alignment", BAD_CAST "2");
+      }
+    }
+    else if (xmlStrcasecmp(N_chunk->name,BAD_CAST "colset") == 0){
+      //a colset is entered without changing the enclosing area
+      addAlignment(N_chunk);
+    }
+    else if (xmlStrcasecmp(N_chunk->name,BAD_CAST "column") == 0){
+      //inside a column the enclosing area is the column itself; the outer
+      //area is put back once the column has been processed
+      tX1 = X1;
+      tX2 = X2;
+      X1 = Attr::num(N_chunk, "X1");
+      X2 = Attr::num(N_chunk, "X2");
+      addAlignment(N_chunk);
+      X1 = tX1;
+      X2 = tX2;
+    }
+  }
+//Original design notes:
+//parse all blocks, and all lines within all blocks
+//do a set of checks and tick a flag if the check fails
+//check for line X1 is textBlock X1
+//check for line X2 is textBlock X2
+//check if line is centered in textBlock (LX1 != TX1 && LX2 != TX2 && LX1-TX1 == TX2-LX2)
+//if the LX1 != TX1 then how much is the difference?
+//a line isn't left aligned if all lines have a different X1 <= not so strong assumption.
+//justified if both are straight except for a couple of (same factor sized) indents at the left
+//else centered if above calculation is correct
+//else left aligned if left side is more straight than right (more lines in the same X1 or common factor)
+//else right
+}
+
+void ABWOutputDev::setPDFDoc(PDFDoc *priv_pdfdoc) {
+  this->pdfdoc = priv_pdfdoc; // generateParagraphs() asks it for the page crop size
+}
+
+//Turns the intermediate tree into AbiWord markup: every style node gets a
+//"props" attribute built from its size/font/bold/italic attributes (which are
+//then removed together with "id"), and every page below N_content is converted
+//by transformPage() into section/paragraph nodes under N_root.
+void ABWOutputDev::createABW() {
+  xmlNodePtr N_cur, N_next;
+  xmlAttrPtr N_prop;
+  xmlChar *size, *font, *bold, *italic;
+  static const char *const rawAttrs[] = { "id", "size", "bold", "italic", "font" };
+  char buf[1024];
+  for (N_cur = N_styleset->children; N_cur; N_cur = N_cur->next){
+    //xmlGetProp returns a copy (or NULL when the attribute is missing); the
+    //copies are freed below and a missing attribute is written as ""
+    size = xmlGetProp(N_cur,BAD_CAST "size");
+    font = xmlGetProp(N_cur,BAD_CAST "font");
+    bold = xmlGetProp(N_cur,BAD_CAST "bold");
+    italic = xmlGetProp(N_cur,BAD_CAST "italic");
+    //one bounded snprintf replaces the unchecked sprintf/strncat chain, and
+    //font-weight is now followed by ';' so that font-style is a separate entry
+    snprintf(buf, sizeof(buf), "margin-top:0pt; color:000000; margin-left:0pt; text-position:normal; widows:2; text-indent:0in; font-variant:normal; margin-right:0pt; lang:nl-NL; line-height:1.0; font-size:%dpt; text-decoration:none; margin-bottom:0pt; bgcolor:transparent; text-align:left; font-stretch:normal;font-family:%s;font-weight:%s;font-style:%s",
+             int(xmlXPathCastStringToNumber(size)),
+             font ? (const char *)font : "", bold ? (const char *)bold : "", italic ? (const char *)italic : "");
+    xmlSetProp(N_cur, BAD_CAST "props", BAD_CAST buf);
+    if (size) xmlFree(size);
+    if (font) xmlFree(font);
+    if (bold) xmlFree(bold);
+    if (italic) xmlFree(italic);
+    for (unsigned int i = 0; i < sizeof(rawAttrs)/sizeof(rawAttrs[0]); i++){
+      N_prop = xmlHasProp(N_cur, BAD_CAST rawAttrs[i]);
+      if (N_prop != NULL) xmlRemoveProp(N_prop);
+    }
+  }
+  //each child of N_content is a page: a section node is attached to the root,
+  //the page is transformed into it, and the page is finally detached
+  N_cur = N_content->children;
+  while (N_cur){
+    N_next = N_cur->next;
+    N_Block = xmlNewChild(N_root, NULL, BAD_CAST "section", NULL);
+    transformPage(N_cur);
+    xmlUnlinkNode(N_cur);
+    N_cur = N_next;
+  }
+  cleanUpNode(N_root, false);
+}
+
+//Translates one node of the intermediate tree into AbiWord nodes. Pages,
+//colsets and columns are walked recursively; a chunk becomes a <p> under the
+//current section (N_Block) and that paragraph is remembered in N_text.
+void ABWOutputDev::transformPage(xmlNodePtr N_parent){
+  char buf[60];
+  xmlNodePtr N_cur, N_curLine, N_curText, N_curWord, text, space;
+  if (xmlStrcasecmp(N_parent->name,BAD_CAST "page") == 0){
+    for (N_cur = N_parent->children; N_cur; N_cur = N_cur->next){
+      transformPage(N_cur);
+    }
+  }
+  else if (xmlStrcasecmp(N_parent->name,BAD_CAST "chunk") == 0){
+    //I start a <p> on each chunk and add all word containment
+    N_text = xmlNewChild(N_Block, NULL, BAD_CAST "p", NULL);
+    //xmlGetProp returns copies; they are freed once used instead of leaked
+    xmlChar *style = xmlGetProp(N_parent,BAD_CAST "style");
+    xmlChar *alignment = xmlGetProp(N_parent,BAD_CAST "alignment");
+    if (int(xmlXPathCastStringToNumber(style)) > 0){
+      xmlNewProp(N_text, BAD_CAST "style", style);
+    }
+    switch (int(xmlXPathCastStringToNumber(alignment))){
+    case 1: xmlNewProp(N_text, BAD_CAST "props", BAD_CAST "text-align:left");
+           break;
+    case 2: xmlNewProp(N_text, BAD_CAST "props", BAD_CAST "text-align:right");
+           break;
+    case 3: xmlNewProp(N_text, BAD_CAST "props", BAD_CAST "text-align:center");
+           break;
+    case 4: xmlNewProp(N_text, BAD_CAST "props", BAD_CAST "text-align:justify");
+           break;
+    }
+    if (style) xmlFree(style);
+    if (alignment) xmlFree(alignment);
+    //three levels down: the first child of each node there is moved into the <p>, then a space
+    for (N_curLine = N_parent->children; N_curLine; N_curLine = N_curLine->next){
+      for (N_curText = N_curLine->children; N_curText; N_curText = N_curText->next){
+        for (N_curWord = N_curText->children; N_curWord; N_curWord = N_curWord->next){
+          text = N_curWord->children;
+          xmlUnlinkNode(text);
+          xmlAddChild(N_text,text);
+          space = xmlNewText(BAD_CAST " ");
+          xmlAddChild(N_text,space);
+        }
+      }
+    }
+  }
+  else if (xmlStrcasecmp(N_parent->name,BAD_CAST "column") == 0){
+    for (N_cur = N_parent->children; N_cur; N_cur = N_cur->next){
+      transformPage(N_cur);
+    }
+    //the <cbr> node is appended to the paragraph written last
+    xmlNewChild(N_text, NULL, BAD_CAST "cbr", NULL);
+  }
+  else if (xmlStrcasecmp(N_parent->name,BAD_CAST "colset") == 0){
+    //a colset gets its own section ("columns:<child count>"); a plain section follows it
+    N_Block = xmlNewChild(N_root, NULL, BAD_CAST "section", NULL);
+    snprintf(buf, sizeof(buf), "columns:%d",xmlLsCountNode(N_parent));
+    xmlNewProp(N_Block, BAD_CAST "props", BAD_CAST buf);
+    for (N_cur = N_parent->children; N_cur; N_cur = N_cur->next){
+      transformPage(N_cur);
+    }
+    N_Block = xmlNewChild(N_root, NULL, BAD_CAST "section", NULL);
+  }
+}
diff -urN -x CVS poppler-abiword-no/poppler/ABWOutputDev.h poppler-abiword/poppler/ABWOutputDev.h
--- poppler-abiword-no/poppler/ABWOutputDev.h	1969-12-31 19:00:00.000000000 -0500
+++ poppler-abiword/poppler/ABWOutputDev.h	2007-03-31 11:34:08.000000000 -0400
@@ -0,0 +1,140 @@
+//========================================================================
+//
+// ABWOutputDev.h
+//
+// Copyright 2006 Jauco Noordzij
+//
+//========================================================================
+
+#ifndef ABWOUTPUTDEV_H
+#define ABWOUTPUTDEV_H
+
+#ifdef __GNUC__
+#pragma interface
+#endif
+
+#include <stdio.h>
+#include "goo/gtypes.h"
+#include "goo/GooList.h"
+#include "GfxFont.h"
+#include "OutputDev.h"
+#include "Link.h"
+#include "Catalog.h"
+#include "UnicodeMap.h"
+#include "PDFDoc.h"
+
+#include <libxml/parser.h>
+#include <libxml/tree.h>
+#include <libxml/xpath.h>
+#include <libxml/xpathInternals.h>
+
+#ifdef WIN32
+#  define SLASH '\\'
+#else
+#  define SLASH '/'
+#endif
+
+#define xoutRound(x) ((int)((x) + 0.5)) /* argument parenthesised so that xoutRound(c ? a : b) rounds the whole expression */
+
+class GfxState;
+class GooString;
+
+//------------------------------------------------------------------------
+// ABWOutputDev
+//------------------------------------------------------------------------
+
+class ABWOutputDev: public OutputDev {
+public:
+
+  // Create an output device that works on the libxml2 document <ext_doc>.
+  // NOTE(review): this comment used to describe a file-name/ASCII7/raw-order
+  // constructor (apparently carried over from another output device), which
+  // does not match this signature. The constructor body is not part of this
+  // header, so what exactly is created inside <ext_doc> should be confirmed
+  // against ABWOutputDev.cc.
+  ABWOutputDev(xmlDocPtr ext_doc);
+
+  // Destructor.
+  virtual ~ABWOutputDev();
+
+  // Status check; this implementation always returns gTrue.
+  virtual GBool isOk() { return gTrue; }
+
+  //---- get info about output device
+
+  // Does this device use upside-down coordinates?
+  // (Upside-down means (0,0) is the top left corner of the page.)
+  virtual GBool upsideDown() { return gTrue; }
+
+  // Does this device use drawChar() or drawString()?
+  virtual GBool useDrawChar() { return gTrue; }
+
+  // Does this device use beginType3Char/endType3Char?  Otherwise,
+  // text in Type 3 fonts will be drawn with drawChar/drawString.
+  virtual GBool interpretType3Chars() { return gFalse; }
+
+  // Does this device need non-text content?
+  virtual GBool needNonText() { return gFalse; }
+
+  //----- initialization and control
+
+  // Start a page.
+  virtual void startPage(int pageNum, GfxState *state);
+
+  // End a page.
+  virtual void endPage();
+
+  //----- update text state
+  virtual void updateFont(GfxState *state);
+
+  //----- text drawing
+  // Device type identifier; this device reports the constant 1234.
+  virtual int DevType() {return 1234;}
+
+  int getPageWidth() { return maxPageWidth; }
+  int getPageHeight() { return maxPageHeight; }
+  float getBiggestSeperator(xmlNodePtr N_set, unsigned int direction, float * C1, float * C2);
+  void recursiveXYC(xmlNodePtr nodeset);
+  void splitNodes(float splitValue, unsigned int direction, xmlNodePtr N_parent, double extravalue);
+  virtual void beginString(GfxState *state, GooString *s);
+  virtual void endString(GfxState *state);
+  virtual void drawChar(GfxState *state, double x, double y,
+			double dx, double dy,
+			double originX, double originY,
+			CharCode code, int nBytes, Unicode *u, int uLen);
+  void beginWord(GfxState *state, double x, double y);
+  void endWord();
+  void beginTextBlock(GfxState *state, double x, double y);
+  void endTextBlock();
+  void interpretXYTree();
+  void ATP_recursive(xmlNodePtr N_cur);
+  void cleanUpNode(xmlNodePtr N_parent, bool aggregateInfo); // prune empty nodes; optionally aggregate child info
+  void transformPage(xmlNodePtr N_parent); // convert a page subtree into AbiWord nodes
+  void generateParagraphs(); // split the chunks of the current page into paragraphs
+  void addAlignment(xmlNodePtr N_parent); // tag chunks with an "alignment" attribute
+  void setPDFDoc(PDFDoc *priv_pdfdoc); // store the document used for page-size queries
+  void createABW(); // rewrite styles and pages into AbiWord markup
+
+private:
+  int maxPageWidth;
+  int maxPageHeight;
+  int G_pageNum;
+  int Style, maxStyle;
+  //A lot of values are nice to have around. I think that keeping them in
+  //member variables is faster & easier than reading them from the xml tree
+  //every time.
+  double height;
+  double wordSpace, charSpace;
+  double X1,X2,Y1,Y2,horDist, verDist, curDx, curDy;
+  bool mightBreak;
+  xmlDocPtr doc;
+  /* node pointers */
+  xmlNodePtr N_root, N_content, N_page, N_style, N_text, N_styleset, N_Block, N_word, N_column, N_colset;
+  xmlNodePtr outputDoc;
+  xmlXPathContextPtr xpathCtx;
+  static const unsigned int HORIZONTAL = 0;
+  static const unsigned int VERTICAL = 1;
+  UnicodeMap *uMap;
+  PDFDoc *pdfdoc;
+};
+#endif
diff -urN -x CVS poppler-abiword-no/poppler/Makefile.am poppler-abiword/poppler/Makefile.am
--- poppler-abiword-no/poppler/Makefile.am	2007-04-01 14:30:01.454606936 -0400
+++ poppler-abiword/poppler/Makefile.am	2007-03-31 11:34:08.000000000 -0400
@@ -80,7 +80,8 @@
 	$(cairo_includes)			\
 	$(arthur_includes)			\
 	$(FREETYPE_CFLAGS)			\
-	$(FONTCONFIG_CFLAGS)
+	$(FONTCONFIG_CFLAGS)			\
+	$(LIBXML_CFLAGS)
 
 lib_LTLIBRARIES = libpoppler.la
 
@@ -95,7 +96,8 @@
 	$(libjpeg_libs)				\
 	$(zlib_libs)				\
 	$(FREETYPE_LIBS)			\
-	$(FONTCONFIG_LIBS)
+	$(FONTCONFIG_LIBS)			\
+	$(LIBXML_LIBS)
 
 libpoppler_la_LDFLAGS = -version-info 1:0:0
 
@@ -154,6 +156,7 @@
 	NameToUnicodeTable.h	\
 	PSOutputDev.h		\
 	TextOutputDev.h		\
+	ABWOutputDev.h		\
 	SecurityHandler.h	\
 	UGooString.h		\
 	UTF8.h			\
@@ -209,6 +212,7 @@
 	PSOutputDev.cc		\
 	TextOutputDev.cc	\
 	PageLabelInfo.h		\
+	ABWOutputDev.cc	\
 	PageLabelInfo.cc	\
 	SecurityHandler.cc	\
 	UGooString.cc	 	\
diff -urN -x CVS poppler-abiword-no/utils/Makefile.am poppler-abiword/utils/Makefile.am
--- poppler-abiword-no/utils/Makefile.am	2007-04-01 14:30:01.561590672 -0400
+++ poppler-abiword/utils/Makefile.am	2007-03-31 11:35:09.000000000 -0400
@@ -15,7 +15,9 @@
 	-I$(top_srcdir)/utils			\
 	-I$(top_srcdir)/poppler			\
 	$(UTILS_CFLAGS)				\
-	$(FONTCONFIG_CFLAGS)
+	$(FONTCONFIG_CFLAGS)			\
+	$(LIBXML_CFLAGS)
+	
 
 LDADD =						\
 	$(top_builddir)/poppler/libpoppler.la	\
@@ -29,6 +31,7 @@
 	pdftops					\
 	pdftotext				\
 	pdftohtml				\
+	pdftoabw				\
 	$(pdftoppm_binary)
 
 dist_man1_MANS =				\
@@ -74,6 +77,10 @@
 	HtmlOutputDev.h				\
 	$(common)
 
+pdftoabw_SOURCES =				\
+	pdftoabw.cc				\
+	$(common)
+
 # Yay, automake!  It should be able to figure out that it has to dist
 # pdftoppm.1, but nooo.  So we just add it here.
 
diff -urN -x CVS poppler-abiword-no/utils/pdftoabw.cc poppler-abiword/utils/pdftoabw.cc
--- poppler-abiword-no/utils/pdftoabw.cc	1969-12-31 19:00:00.000000000 -0500
+++ poppler-abiword/utils/pdftoabw.cc	2007-03-31 13:30:54.000000000 -0400
@@ -0,0 +1,138 @@
+//========================================================================
+//
+// pdftoabw.cc
+//
+//
+// Copyright 1999-2000 G. Ovtcharov
+//========================================================================
+
+#include "config.h"
+#include <poppler-config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <string.h>
+#include <dirent.h>
+#include <time.h>
+#include "parseargs.h"
+#include "goo/GooString.h"
+#include "goo/gmem.h"
+#include "Object.h"
+#include "Stream.h"
+#include "Array.h"
+#include "Dict.h"
+#include "XRef.h"
+#include "Catalog.h"
+#include "Page.h"
+#include "PDFDoc.h"
+#include "ABWOutputDev.h"
+#include "PSOutputDev.h"
+#include "GlobalParams.h"
+#include "Error.h"
+#include "UGooString.h"
+#include "goo/gfile.h"
+#include <libxml/parser.h>
+#include <libxml/tree.h>
+
+
+static int firstPage = 1;  // target of the -f option ("first page to convert")
+static int lastPage = 0;  // target of the -l option; main() turns 0 into the document's page count
+GBool printCommands = gTrue;  // NOTE(review): not read in this file; non-static, so possibly used from another translation unit -- TODO confirm
+GBool prettyPrint = gFalse;  // target of the --format option
+static GBool printHelp = gFalse;  // target of the -h / --help options
+GBool stout=gFalse;  // target of the --stdout option
+
+static char ownerPassword[33] = "";  // buffer filled by the --opw option
+static char userPassword[33] = "";  // buffer filled by the --upw option
+
+static GooString* getInfoString(Dict *infoDict, char *key);  // NOTE(review): declared only; no definition or call in this file
+static GooString* getInfoDate(Dict *infoDict, char *key);  // NOTE(review): declared only; no definition or call in this file
+
+xmlDocPtr XMLdoc;  // libxml2 document that main() creates, hands to ABWOutputDev, saves and frees
+
+static char textEncName[128] = "";  // NOTE(review): not referenced anywhere else in this file
+
+static ArgDesc argDesc[] = {  // option table handed to parseArgs() in main()
+  {"-f",      argInt,      &firstPage,     0,
+   "first page to convert"},
+  {"-l",      argInt,      &lastPage,      0,
+   "last page to convert"},
+  {"-h",      argFlag,     &printHelp,     0,
+   "print usage information"},
+  {"--help",   argFlag,     &printHelp,     0,
+   "print usage information"},
+  {"--format",   argFlag,     &prettyPrint,     0,  // NOTE(review): prettyPrint is never read in this file
+   "pretty-print the XML output"},
+  {"--stdout"  ,argFlag,    &stout,         0,
+   "use standard output"},
+  {"--opw",    argString,   ownerPassword,  sizeof(ownerPassword),
+   "owner password (for encrypted files)"},
+  {"--upw",    argString,   userPassword,   sizeof(userPassword),
+   "user password (for encrypted files)"},
+  {NULL}
+};
+
+// Convert the PDF named by the first positional argument to AbiWord XML and
+// write it to stdout.  Returns 0 on success (or after -h/--help), 1 if the
+// arguments are invalid, the PDF cannot be opened, or the output fails.
+int main(int argc, char *argv[]) {
+  PDFDoc *doc = NULL;
+  GooString *fileName = NULL;
+  GooString *ownerPW = NULL, *userPW = NULL;
+  ABWOutputDev *abwOut = NULL;
+  // FIXME: add outputfilename stuff; for now always write to "-" (stdout)
+  const char *outpName = "-";
+  int exitCode = 1;
+  GBool ok;
+
+  // parse args: at least one positional argument (the PDF file) must remain
+  ok = parseArgs(argDesc, &argc, argv);
+  if (!ok || argc < 2 || printHelp) {
+    printUsage("pdftoabw", "<PDF-file>", argDesc);
+    return printHelp ? 0 : exitCode;
+  }
+  globalParams = new GlobalParams();
+
+  // open the PDF file, forwarding the passwords given with --opw/--upw;
+  // fileName is not deleted here, presumably PDFDoc owns it -- TODO confirm
+  fileName = new GooString(argv[1]);
+  if (ownerPassword[0]) ownerPW = new GooString(ownerPassword);
+  if (userPassword[0]) userPW = new GooString(userPassword);
+  doc = new PDFDoc(fileName, ownerPW, userPW);
+  delete userPW;
+  delete ownerPW;
+  if (!doc->isOk()) goto cleanup;  // the document could not be opened
+  // NOTE: the doc->okToCopy() permission check is still disabled here
+
+  // clamp the requested page range to the pages the document really has
+  if (firstPage < 1) firstPage = 1;
+  if (lastPage < 1 || lastPage > doc->getNumPages())
+    lastPage = doc->getNumPages();
+
+  // render the pages into the XML tree and write it out
+  XMLdoc = xmlNewDoc(BAD_CAST "1.0");
+  abwOut = new ABWOutputDev(XMLdoc);
+  abwOut->setPDFDoc(doc);
+  if (abwOut->isOk()) {
+    doc->displayPages(abwOut, firstPage, lastPage, 72, 72, 0, gTrue, gFalse, gFalse);
+    abwOut->createABW();
+    exitCode = 0;
+  }
+  if (xmlSaveFormatFileEnc(outpName, XMLdoc, "UTF-8", 1) == -1) {
+    fprintf(stderr, "pdftoabw: couldn't write the output document\n");
+    exitCode = 1;
+  }
+
+  // clean up; globalParams goes last so it outlives every object above
+ cleanup:
+  delete doc;
+  if (XMLdoc) xmlFreeDoc(XMLdoc);
+  delete abwOut;
+  delete globalParams;
+
+  // check for memory leaks
+  Object::memCheck(stderr);
+  gMemReport(stderr);
+
+  return exitCode;
+}


More information about the poppler mailing list