[poppler] utils/HtmlOutputDev.cc utils/pdf2xml.dtd

Albert Astals Cid aacid at kemper.freedesktop.org
Tue Nov 15 12:57:47 PST 2011


 utils/HtmlOutputDev.cc |   48 ++++++++++++++++++++++++++++++++++++++++--------
 utils/pdf2xml.dtd      |   22 +++++++++++++++++-----
 2 files changed, 57 insertions(+), 13 deletions(-)

New commits:
commit 65388b1aaf9a78efcf9486d5e2d4bdce76f11194
Author: Igor Slepchin <igor.slepchin at gmail.com>
Date:   Tue Nov 15 21:53:40 2011 +0100

    Output images in pdftohtml -xml mode if no -i option is specified.
    
    Comes with an attached update to pdf2xml.dtd

diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc
index 43e4ec4..c1447ad 100644
--- a/utils/HtmlOutputDev.cc
+++ b/utils/HtmlOutputDev.cc
@@ -30,6 +30,7 @@
 // Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac at cdacmumbai.in) and Onkar Potdar (onkar at cdacmumbai.in)
 // Copyright (C) 2011 Joshua Richardson <jric at chegg.com>
 // Copyright (C) 2011 Stephen Reichling <sreichling at chegg.com>
+// Copyright (C) 2011 Igor Slepchin <igor.slepchin at gmail.com>
 //
 // To see a description of the changes please see the Changelog file that
 // came with your tarball or type make ChangeLog if you are building from git
@@ -64,6 +65,21 @@
 
 #define DEBUG __FILE__ << ": " << __LINE__ << ": DEBUG: "
 
+class HtmlImage
+{
+public:
+    HtmlImage(GooString *_fName, GfxState *state)
+      : fName(_fName) {
+    state->transform(0, 0, &xMin, &yMax);
+    state->transform(1, 1, &xMax, &yMin);
+  }
+ ~HtmlImage() { delete fName; }
+
+  double xMin, xMax;		// x extent: xMin from state->transform(0,0), xMax from state->transform(1,1)
+  double yMin, yMax;		// y extent: yMax from state->transform(0,0), yMin from state->transform(1,1)
+  GooString  *fName;		// image file name; deleted by ~HtmlImage, so the object takes ownership
+};
+
 // returns true if x is closer to y than x is to z
 static inline bool IS_CLOSER(float x, float y, float z) { return fabs((x)-(y)) < fabs((x)-(z)); }
 
@@ -725,6 +741,15 @@ void HtmlPage::dumpAsXML(FILE* f,int page){
     delete fontCSStyle;
   }
   
+  int listlen=HtmlOutputDev::imgList->getLength();
+  for (int i = 0; i < listlen; i++) {
+    HtmlImage *img = (HtmlImage*)HtmlOutputDev::imgList->del(0);
+    fprintf(f,"<image top=\"%d\" left=\"%d\" ",xoutRound(img->yMin),xoutRound(img->xMin));
+    fprintf(f,"width=\"%d\" height=\"%d\" ",xoutRound(img->xMax-img->xMin),xoutRound(img->yMax-img->yMin));
+    fprintf(f,"src=\"%s\"/>\n",img->fName->getCString());
+    delete img;
+  }
+
   for(HtmlString *tmp=yxStrings;tmp;tmp=tmp->yxNext){
     if (tmp->htext){
       fprintf(f,"<text top=\"%d\" left=\"%d\" ",xoutRound(tmp->yMin),xoutRound(tmp->xMin));
@@ -864,9 +889,9 @@ void HtmlPage::dump(FILE *f, int pageNum)
     // Loop over the list of image names on this page
     int listlen=HtmlOutputDev::imgList->getLength();
     for (int i = 0; i < listlen; i++) {
-      GooString *fName= (GooString *)HtmlOutputDev::imgList->del(0);
-      fprintf(f,"<IMG src=\"%s\"/><br/>\n",fName->getCString());
-      delete fName;
+      HtmlImage *img = (HtmlImage*)HtmlOutputDev::imgList->del(0);
+      fprintf(f,"<IMG src=\"%s\"/><br/>\n",img->fName->getCString());
+      delete img;
     }
     HtmlOutputDev::imgNum=1;
 
@@ -1236,7 +1261,7 @@ void HtmlOutputDev::drawImageMask(GfxState *state, Object *ref, Stream *str,
 				  int width, int height, GBool invert,
 				  GBool interpolate, GBool inlineImg) {
 
-  if (ignore||complexMode) {
+  if (ignore||(complexMode && !xml)) {
     OutputDev::drawImageMask(state, ref, str, width, height, invert, interpolate, inlineImg);
     return;
   }
@@ -1272,7 +1297,10 @@ void HtmlOutputDev::drawImageMask(GfxState *state, Object *ref, Stream *str,
 
     fclose(f1);
    
-  if (fName) imgList->append(fName);
+    if (fName) {
+        HtmlImage *img = new HtmlImage(fName, state);
+        imgList->append(img);
+    }
   }
   else {
     OutputDev::drawImageMask(state, ref, str, width, height, invert, interpolate, inlineImg);
@@ -1283,7 +1311,7 @@ void HtmlOutputDev::drawImage(GfxState *state, Object *ref, Stream *str,
 			      int width, int height, GfxImageColorMap *colorMap,
 			      GBool interpolate, int *maskColors, GBool inlineImg) {
 
-  if (ignore||complexMode) {
+  if (ignore||(complexMode && !xml)) {
     OutputDev::drawImage(state, ref, str, width, height, colorMap, interpolate,
 			 maskColors, inlineImg);
     return;
@@ -1324,7 +1352,10 @@ void HtmlOutputDev::drawImage(GfxState *state, Object *ref, Stream *str,
     
     fclose(f1);
   
-    if (fName) imgList->append(fName);
+    if (fName) {
+        HtmlImage *img = new HtmlImage(fName, state);
+        imgList->append(img);
+    }
   }
   else {
 #ifdef ENABLE_LIBPNG
@@ -1390,7 +1421,8 @@ void HtmlOutputDev::drawImage(GfxState *state, Object *ref, Stream *str,
     fclose(f1);
 
     free(row);
-    imgList->append(fName);
+    HtmlImage *img = new HtmlImage(fName, state);
+    imgList->append(img);
     ++imgNum;
     imgStr->close();
     delete imgStr;
diff --git a/utils/pdf2xml.dtd b/utils/pdf2xml.dtd
index 1afa4fe..9cd3880 100644
--- a/utils/pdf2xml.dtd
+++ b/utils/pdf2xml.dtd
@@ -1,6 +1,6 @@
-<?xml version="1.0"?>
+<?xml version="1.0" encoding="UTF-8"?>
 <!ELEMENT pdf2xml (page+)>
-<!ELEMENT page (fontspec*, text*)>
+<!ELEMENT page (fontspec*, image*, text*)>
 <!ATTLIST page
 	number CDATA #REQUIRED
 	position CDATA #REQUIRED
@@ -16,7 +16,7 @@
 	family CDATA #REQUIRED
 	color CDATA #REQUIRED
 >
-<!ELEMENT text (#PCDATA | b | i)*>
+<!ELEMENT text (#PCDATA | b | i | a)*>
 <!ATTLIST text
 	top CDATA #REQUIRED
 	left CDATA #REQUIRED
@@ -24,5 +24,17 @@
 	height CDATA #REQUIRED
 	font CDATA #REQUIRED
 >
-<!ELEMENT b (#PCDATA)>
-<!ELEMENT i (#PCDATA)>
+<!ELEMENT b (#PCDATA | i)*>
+<!ELEMENT i (#PCDATA | b)*>
+<!ELEMENT a (#PCDATA)>
+<!ATTLIST a
+    href CDATA #REQUIRED
+>
+<!ELEMENT image EMPTY>
+<!ATTLIST image
+	top CDATA #REQUIRED
+	left CDATA #REQUIRED
+	width CDATA #REQUIRED
+	height CDATA #REQUIRED
+	src CDATA #REQUIRED
+>


More information about the poppler mailing list