[poppler] pdftohtml patch: new reflow option
Warren Toomey
poppler at tuhs.org
Tue Sep 30 03:07:04 PDT 2008
Here is my controversial patch to utils/pdftohtml to alter the format of
output paragraphs. The program now has a command-line option "-reflow"
which enables the new output format.
I've rewritten the logic on lines 491-505 of utils/HtmlOutputDev.cc to be
a lot simpler. I have not yet checked the effect on complex mode or XML
output; I shall do so over the next few days. I have also updated the manual page.
Cheers,
Warren
-------------- next part --------------
*** pdftohtml.cc 2008/09/30 09:06:27 1.2
--- pdftohtml.cc 2008/09/30 09:13:05
***************
*** 54,59 ****
--- 54,60 ----
GBool noDrm=gFalse;
GBool showHidden = gFalse;
+ GBool reFlow = gFalse; // Output "reflow" paragraphs
GBool noMerge = gFalse;
static char ownerPassword[33] = "";
static char userPassword[33] = "";
***************
*** 92,97 ****
--- 93,100 ----
"zoom the pdf document (default 1.5)"},
{"-xml", argFlag, &xml, 0,
"output for XML post-processing"},
+ {"-reflow", argFlag, &reFlow, 0,
+ "output reflow paragraphs"},
{"-hidden", argFlag, &showHidden, 0,
"output hidden text"},
{"-nomerge", argFlag, &noMerge, 0,
*** pdftohtml.1 2008/09/30 09:06:27 1.2
--- pdftohtml.1 2008/09/30 09:17:37
***************
*** 52,57 ****
--- 52,62 ----
.B \-zoom <fp>
zoom the pdf document (default 1.5)
.TP
+ .B \-reflow
+ join paragraph lines together and separate paragraphs with a <p> tag. With
+ this flag off, paragraph lines are separated by <br> tags and paragraphs are
+ also separated by <br> tags.
+ .TP
.B \-xml
output for XML post-processing
.TP
***************
*** 71,77 ****
output device name for Ghostscript (png16m, jpeg etc)
.TP
.B \-nomerge
! do not merge paragraphs
.TP
.B \-nodrm
override document DRM settings
--- 76,82 ----
output device name for Ghostscript (png16m, jpeg etc)
.TP
.B \-nomerge
! separate output HTML lines with newline characters
.TP
.B \-nodrm
override document DRM settings
*** HtmlOutputDev.cc 2008/09/30 08:51:18 1.2
--- HtmlOutputDev.cc 2008/09/30 09:53:47
***************
*** 48,53 ****
--- 48,54 ----
extern GBool xml;
extern GBool showHidden;
extern GBool noMerge;
+ extern GBool reFlow;
static GooString* basename(GooString* str){
***************
*** 376,385 ****
void HtmlPage::coalesce() {
HtmlString *str1, *str2;
HtmlFont *hfont1, *hfont2;
! double space, horSpace, vertSpace, vertOverlap;
! GBool addSpace, addLineBreak;
int n, i;
double curX, curY;
#if 0 //~ for debugging
for (str1 = yxStrings; str1; str1 = str1->yxNext) {
--- 377,389 ----
void HtmlPage::coalesce() {
HtmlString *str1, *str2;
HtmlFont *hfont1, *hfont2;
! double space, horSpace, vertSpace;
! GBool addSpace;
! GBool nextLine; // is str2 on the next line below?
! GBool addNewline; // should we output a newline?
int n, i;
double curX, curY;
+ double lineStartX=0.0; // x-value of last line start
#if 0 //~ for debugging
for (str1 = yxStrings; str1; str1 = str1->yxNext) {
***************
*** 447,497 ****
str1->htext->insert(0, ls);
delete ls;
}
! curX = str1->xMin; curY = str1->yMin;
while (str1 && (str2 = str1->yxNext)) {
hfont2 = getFont(str2);
space = str1->yMax - str1->yMin;
horSpace = str2->xMin - str1->xMax;
! addLineBreak = !noMerge && (fabs(str1->xMin - str2->xMin) < 0.4);
vertSpace = str2->yMin - str1->yMax;
//printf("coalesce %d %d %f? ", str1->dir, str2->dir, d);
! if (str2->yMin >= str1->yMin && str2->yMin <= str1->yMax)
! {
! vertOverlap = str1->yMax - str2->yMin;
! } else
! if (str2->yMax >= str1->yMin && str2->yMax <= str1->yMax)
! {
! vertOverlap = str2->yMax - str1->yMin;
! } else
! {
! vertOverlap = 0;
! }
!
! if (
! (
! (
! (
! (rawOrder && vertOverlap > 0.5 * space)
! ||
! (!rawOrder && str2->yMin < str1->yMax)
! ) &&
! (horSpace > -0.5 * space && horSpace < space)
! ) ||
! (vertSpace >= 0 && vertSpace < 0.5 * space && addLineBreak)
! ) &&
! (!complexMode || (hfont1->isEqualIgnoreBold(*hfont2))) && // in complex mode fonts must be the same, in other modes fonts do not metter
! str1->dir == str2->dir // text direction the same
! )
{
! // printf("yes\n");
n = str1->len + str2->len;
if ((addSpace = horSpace > 0.1 * space)) {
++n;
}
! if (addLineBreak) {
++n;
}
--- 451,515 ----
str1->htext->insert(0, ls);
delete ls;
}
! lineStartX= curX = str1->xMin; curY = str1->yMin;
while (str1 && (str2 = str1->yxNext)) {
hfont2 = getFont(str2);
space = str1->yMax - str1->yMin;
horSpace = str2->xMin - str1->xMax;
! // Determine if str2 is on the line below the current line
! addNewline=nextLine = !noMerge && (fabs(str2->yMin - str1->yMin) > 5.0);
vertSpace = str2->yMin - str1->yMax;
+ #if 1
+ // Heuristic: if the last character in str1 is a hyphen,
+ // turn off addNewline. This will "glue" hyphenated words
+ // that have been split over multiple lines.
+ if (reFlow && str1->text[str1->len -1] == '-') {
+ addNewline=0;
+ // Also remove the hyphen
+ str1->len--;
+ str1->htext->del(str1->htext->getLength() - 1, 1);
+ }
+ #endif
+
//printf("coalesce %d %d %f? ", str1->dir, str2->dir, d);
! // Is str2 a new paragraph?
! if (nextLine && (
! // Is this an indented new line?
! (str2->xMin > lineStartX + 3.0)
! // Or is there a blank line between this and the last line?
! || (vertSpace > 0.5 * space) ))
{
! // A new paragraph, so keep strings separate
! // printf("new paragraph\n");
! GBool finish_a = str1->getLink() != NULL;
! GBool finish_bold = hfont1->isBold();
! GBool finish_italic = hfont1->isItalic();
! CloseTags( str1->htext, finish_a, finish_italic, finish_bold );
!
! str1->xMin = curX; str1->yMin = curY;
! str1 = str2;
! curX = str1->xMin; curY = str1->yMin;
! lineStartX= str1->xMin;
! hfont1 = hfont2;
! if( hfont1->isBold() )
! str1->htext->insert(0,"<b>",3);
! if( hfont1->isItalic() )
! str1->htext->insert(0,"<i>",3);
! if( str1->getLink() != NULL ) {
! GooString *ls = str1->getLink()->getLinkStart();
! str1->htext->insert(0, ls);
! delete ls;
! }
! } else {
! // printf("same paragraph\n");
n = str1->len + str2->len;
if ((addSpace = horSpace > 0.1 * space)) {
++n;
}
! if (nextLine) {
++n;
}
***************
*** 502,519 ****
str1->size * sizeof(double));
if (addSpace) {
str1->text[str1->len] = 0x20;
! str1->htext->append(xml?" ":" ");
str1->xRight[str1->len] = str2->xMin;
++str1->len;
}
! if (addLineBreak) {
! str1->text[str1->len] = '\n';
! str1->htext->append("<br>");
! str1->xRight[str1->len] = str2->xMin;
! ++str1->len;
str1->yMin = str2->yMin;
str1->yMax = str2->yMax;
str1->xMax = str2->xMax;
int fontLineSize = hfont1->getLineSize();
int curLineSize = (int)(vertSpace + space);
if( curLineSize != fontLineSize )
--- 520,540 ----
str1->size * sizeof(double));
if (addSpace) {
str1->text[str1->len] = 0x20;
! str1->htext->append((xml || reFlow) ? " " : " ");
str1->xRight[str1->len] = str2->xMin;
++str1->len;
}
! if (nextLine) {
! if (addNewline) {
! str1->text[str1->len] = '\n';
! str1->htext->append(reFlow ? "\n" : "<br>");
! str1->xRight[str1->len] = str2->xMin;
! ++str1->len;
! }
str1->yMin = str2->yMin;
str1->yMax = str2->yMax;
str1->xMax = str2->xMax;
+ lineStartX= str2->xMin;
int fontLineSize = hfont1->getLineSize();
int curLineSize = (int)(vertSpace + space);
if( curLineSize != fontLineSize )
***************
*** 565,590 ****
}
str1->yxNext = str2->yxNext;
delete str2;
- } else { // keep strings separate
- // printf("no\n");
- GBool finish_a = str1->getLink() != NULL;
- GBool finish_bold = hfont1->isBold();
- GBool finish_italic = hfont1->isItalic();
- CloseTags( str1->htext, finish_a, finish_italic, finish_bold );
-
- str1->xMin = curX; str1->yMin = curY;
- str1 = str2;
- curX = str1->xMin; curY = str1->yMin;
- hfont1 = hfont2;
- if( hfont1->isBold() )
- str1->htext->insert(0,"<b>",3);
- if( hfont1->isItalic() )
- str1->htext->insert(0,"<i>",3);
- if( str1->getLink() != NULL ) {
- GooString *ls = str1->getLink()->getLinkStart();
- str1->htext->insert(0, ls);
- delete ls;
- }
}
}
str1->xMin = curX; str1->yMin = curY;
--- 586,591 ----
***************
*** 687,693 ****
if( !noframes )
{
! fputs("</HEAD>\n<BODY bgcolor=\"#A0A0A0\" vlink=\"blue\" link=\"blue\">\n",pageFile);
}
if( !ignore )
--- 688,694 ----
if( !noframes )
{
! fputs("</HEAD>\n<BODY vlink=\"blue\" link=\"blue\">\n",pageFile);
}
if( !ignore )
***************
*** 750,764 ****
delete fName;
GooString* str;
! for(HtmlString *tmp=yxStrings;tmp;tmp=tmp->yxNext){
if (tmp->htext){
str=new GooString(tmp->htext);
fputs(str->getCString(),f);
delete str;
! fputs("<br>\n",f);
}
}
! fputs("<hr>\n",f);
}
}
--- 751,765 ----
delete fName;
GooString* str;
! for(HtmlString *tmp=yxStrings;tmp;tmp=tmp->yxNext) {
if (tmp->htext){
str=new GooString(tmp->htext);
fputs(str->getCString(),f);
delete str;
! fputs(reFlow ? "<p>\n" : "<br>\n",f);
}
}
! if (!reFlow) fputs("<hr>\n",f);
}
}
***************
*** 992,998 ****
dumpMetaVars(page);
fprintf(page,"</HEAD>\n");
! fprintf(page,"<BODY bgcolor=\"#A0A0A0\" vlink=\"blue\" link=\"blue\">\n");
}
}
ok = gTrue;
--- 993,999 ----
dumpMetaVars(page);
fprintf(page,"</HEAD>\n");
! fprintf(page,"<BODY vlink=\"blue\" link=\"blue\">\n");
}
}
ok = gTrue;
***************
*** 1498,1504 ****
if (noframes)
{
output = page;
! fputs("<hr>\n", output);
}
else
{
--- 1499,1505 ----
if (noframes)
{
output = page;
! if (!reFlow) fputs("<hr>\n", output);
}
else
{
***************
*** 1515,1521 ****
GBool done = newOutlineLevel(output, outlines, catalog);
if (done && !complexMode)
! fputs("<hr>\n", output);
if (bClose)
{
--- 1516,1522 ----
GBool done = newOutlineLevel(output, outlines, catalog);
if (done && !complexMode)
! if (!reFlow) fputs("<hr>\n", output);
if (bClose)
{
More information about the poppler mailing list