[poppler] utils/HtmlOutputDev.cc utils/pdf2xml.dtd
Albert Astals Cid
aacid at kemper.freedesktop.org
Tue Nov 15 12:57:47 PST 2011
utils/HtmlOutputDev.cc | 48 ++++++++++++++++++++++++++++++++++++++++--------
utils/pdf2xml.dtd | 22 +++++++++++++++++-----
2 files changed, 57 insertions(+), 13 deletions(-)
New commits:
commit 65388b1aaf9a78efcf9486d5e2d4bdce76f11194
Author: Igor Slepchin <igor.slepchin at gmail.com>
Date: Tue Nov 15 21:53:40 2011 +0100
Output images in pdftohtml -xml mode if no -i option is specified.
Comes with an attached update to pdf2xml.dtd
diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc
index 43e4ec4..c1447ad 100644
--- a/utils/HtmlOutputDev.cc
+++ b/utils/HtmlOutputDev.cc
@@ -30,6 +30,7 @@
// Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac at cdacmumbai.in) and Onkar Potdar (onkar at cdacmumbai.in)
// Copyright (C) 2011 Joshua Richardson <jric at chegg.com>
// Copyright (C) 2011 Stephen Reichling <sreichling at chegg.com>
+// Copyright (C) 2011 Igor Slepchin <igor.slepchin at gmail.com>
//
// To see a description of the changes please see the Changelog file that
// came with your tarball or type make ChangeLog if you are building from git
@@ -64,6 +65,21 @@
#define DEBUG __FILE__ << ": " << __LINE__ << ": DEBUG: "
+class HtmlImage
+{
+public:
+ HtmlImage(GooString *_fName, GfxState *state)
+ : fName(_fName) {
+ state->transform(0, 0, &xMin, &yMax);
+ state->transform(1, 1, &xMax, &yMin);
+ }
+ ~HtmlImage() { delete fName; }
+
+ double xMin, xMax; // image x coordinates
+ double yMin, yMax; // image y coordinates
+ GooString *fName; // image file name
+};
+
// returns true if x is closer to y than x is to z
static inline bool IS_CLOSER(float x, float y, float z) { return fabs((x)-(y)) < fabs((x)-(z)); }
@@ -725,6 +741,15 @@ void HtmlPage::dumpAsXML(FILE* f,int page){
delete fontCSStyle;
}
+ int listlen=HtmlOutputDev::imgList->getLength();
+ for (int i = 0; i < listlen; i++) {
+ HtmlImage *img = (HtmlImage*)HtmlOutputDev::imgList->del(0);
+ fprintf(f,"<image top=\"%d\" left=\"%d\" ",xoutRound(img->yMin),xoutRound(img->xMin));
+ fprintf(f,"width=\"%d\" height=\"%d\" ",xoutRound(img->xMax-img->xMin),xoutRound(img->yMax-img->yMin));
+ fprintf(f,"src=\"%s\"/>\n",img->fName->getCString());
+ delete img;
+ }
+
for(HtmlString *tmp=yxStrings;tmp;tmp=tmp->yxNext){
if (tmp->htext){
fprintf(f,"<text top=\"%d\" left=\"%d\" ",xoutRound(tmp->yMin),xoutRound(tmp->xMin));
@@ -864,9 +889,9 @@ void HtmlPage::dump(FILE *f, int pageNum)
// Loop over the list of image names on this page
int listlen=HtmlOutputDev::imgList->getLength();
for (int i = 0; i < listlen; i++) {
- GooString *fName= (GooString *)HtmlOutputDev::imgList->del(0);
- fprintf(f,"<IMG src=\"%s\"/><br/>\n",fName->getCString());
- delete fName;
+ HtmlImage *img = (HtmlImage*)HtmlOutputDev::imgList->del(0);
+ fprintf(f,"<IMG src=\"%s\"/><br/>\n",img->fName->getCString());
+ delete img;
}
HtmlOutputDev::imgNum=1;
@@ -1236,7 +1261,7 @@ void HtmlOutputDev::drawImageMask(GfxState *state, Object *ref, Stream *str,
int width, int height, GBool invert,
GBool interpolate, GBool inlineImg) {
- if (ignore||complexMode) {
+ if (ignore||(complexMode && !xml)) {
OutputDev::drawImageMask(state, ref, str, width, height, invert, interpolate, inlineImg);
return;
}
@@ -1272,7 +1297,10 @@ void HtmlOutputDev::drawImageMask(GfxState *state, Object *ref, Stream *str,
fclose(f1);
- if (fName) imgList->append(fName);
+ if (fName) {
+ HtmlImage *img = new HtmlImage(fName, state);
+ imgList->append(img);
+ }
}
else {
OutputDev::drawImageMask(state, ref, str, width, height, invert, interpolate, inlineImg);
@@ -1283,7 +1311,7 @@ void HtmlOutputDev::drawImage(GfxState *state, Object *ref, Stream *str,
int width, int height, GfxImageColorMap *colorMap,
GBool interpolate, int *maskColors, GBool inlineImg) {
- if (ignore||complexMode) {
+ if (ignore||(complexMode && !xml)) {
OutputDev::drawImage(state, ref, str, width, height, colorMap, interpolate,
maskColors, inlineImg);
return;
@@ -1324,7 +1352,10 @@ void HtmlOutputDev::drawImage(GfxState *state, Object *ref, Stream *str,
fclose(f1);
- if (fName) imgList->append(fName);
+ if (fName) {
+ HtmlImage *img = new HtmlImage(fName, state);
+ imgList->append(img);
+ }
}
else {
#ifdef ENABLE_LIBPNG
@@ -1390,7 +1421,8 @@ void HtmlOutputDev::drawImage(GfxState *state, Object *ref, Stream *str,
fclose(f1);
free(row);
- imgList->append(fName);
+ HtmlImage *img = new HtmlImage(fName, state);
+ imgList->append(img);
++imgNum;
imgStr->close();
delete imgStr;
diff --git a/utils/pdf2xml.dtd b/utils/pdf2xml.dtd
index 1afa4fe..9cd3880 100644
--- a/utils/pdf2xml.dtd
+++ b/utils/pdf2xml.dtd
@@ -1,6 +1,6 @@
-<?xml version="1.0"?>
+<?xml version="1.0" encoding="UTF-8"?>
<!ELEMENT pdf2xml (page+)>
-<!ELEMENT page (fontspec*, text*)>
+<!ELEMENT page (fontspec*, image*, text*)>
<!ATTLIST page
number CDATA #REQUIRED
position CDATA #REQUIRED
@@ -16,7 +16,7 @@
family CDATA #REQUIRED
color CDATA #REQUIRED
>
-<!ELEMENT text (#PCDATA | b | i)*>
+<!ELEMENT text (#PCDATA | b | i | a)*>
<!ATTLIST text
top CDATA #REQUIRED
left CDATA #REQUIRED
@@ -24,5 +24,17 @@
height CDATA #REQUIRED
font CDATA #REQUIRED
>
-<!ELEMENT b (#PCDATA)>
-<!ELEMENT i (#PCDATA)>
+<!ELEMENT b (#PCDATA | i)*>
+<!ELEMENT i (#PCDATA | b)*>
+<!ELEMENT a (#PCDATA)>
+<!ATTLIST a
+ href CDATA #REQUIRED
+>
+<!ELEMENT image EMPTY>
+<!ATTLIST image
+ top CDATA #REQUIRED
+ left CDATA #REQUIRED
+ width CDATA #REQUIRED
+ height CDATA #REQUIRED
+ src CDATA #REQUIRED
+>
More information about the poppler mailing list