[poppler] pdftohtml patch: new reflow option (fixed)

Warren Toomey poppler at tuhs.org
Tue Sep 30 23:49:35 PDT 2008


This patch adds the -reflow option to pdftohtml, which produces nicer
HTML output, and replaces the broken patch that I submitted as
http://lists.freedesktop.org/archives/poppler/2008-September/004115.html
The patch has no significant effect on the -complex and -xml modes, and -nomerge
still works. The patch also fixes this Bugzilla bug for poppler:
https://bugs.freedesktop.org/show_bug.cgi?id=12522

Thanks,
	Warren
-------------- next part --------------
--- HtmlOutputDev.cc	Wed Oct  1 15:38:03 2008
+++ HtmlOutputDev.cc	Wed Oct  1 15:38:14 2008
@@ -53,6 +53,7 @@
 extern GBool xml;
 extern GBool showHidden;
 extern GBool noMerge;
+extern GBool reFlow;
 
 static GooString* basename(GooString* str){
   
@@ -381,10 +382,13 @@
 void HtmlPage::coalesce() {
   HtmlString *str1, *str2;
   HtmlFont *hfont1, *hfont2;
-  double space, horSpace, vertSpace, vertOverlap;
-  GBool addSpace, addLineBreak;
+  double space, horSpace, vertSpace;
+  GBool addSpace;
+  GBool nextLine;			// is str2 on the next line below?
+  GBool addNewline;			// should we output a newline?
   int n, i;
   double curX, curY;
+  double lineStartX=0.0;		// x-value of last line start
 
 #if 0 //~ for debugging
   for (str1 = yxStrings; str1; str1 = str1->yxNext) {
@@ -452,51 +456,65 @@
     str1->htext->insert(0, ls);
     delete ls;
   }
-  curX = str1->xMin; curY = str1->yMin;
+  lineStartX= curX = str1->xMin; curY = str1->yMin;
 
   while (str1 && (str2 = str1->yxNext)) {
     hfont2 = getFont(str2);
     space = str1->yMax - str1->yMin;
     horSpace = str2->xMin - str1->xMax;
-    addLineBreak = !noMerge && (fabs(str1->xMin - str2->xMin) < 0.4);
+    // Determine if str2 is on the line below the current line
+    addNewline=nextLine = !noMerge && (fabs(str2->yMin - str1->yMin) > 5.0);
     vertSpace = str2->yMin - str1->yMax;
 
+    // Heuristic: if the last character in str1 is a hyphen,
+    // turn off addNewline. This will "glue" hyphenated words
+    // that have been split over multiple lines.
+    if (reFlow && str1->text[str1->len -1] == '-') {
+       addNewline=0;
+       // Also remove the hyphen
+       str1->len--;
+       str1->htext->del(str1->htext->getLength() - 1, 1);
+    }
+
 //printf("coalesce %d %d %f? ", str1->dir, str2->dir, d);
 
-    if (str2->yMin >= str1->yMin && str2->yMin <= str1->yMax)
-    {
-	vertOverlap = str1->yMax - str2->yMin;
-    } else
-    if (str2->yMax >= str1->yMin && str2->yMax <= str1->yMax)
-    {
-	vertOverlap = str2->yMax - str1->yMin;
-    } else
+    // Is str2 a new paragraph?
+    if (nextLine && (
+            // Is this an indented new line?
+            (str2->xMin > lineStartX + 3.0)
+            // Or is there a blank line between this and the last line?
+         || (vertSpace > 0.5 * space)
+	    // Or it is XML output, so we always separate each line
+	 || xml ))
     {
-    	vertOverlap = 0;
-    } 
-    
-    if (
-	(
-	 (
-	  (
-	   (rawOrder && vertOverlap > 0.5 * space) 
-	   ||
-	   (!rawOrder && str2->yMin < str1->yMax)
-	  ) &&
-	  (horSpace > -0.5 * space && horSpace < space)
-	 ) ||
-       	 (vertSpace >= 0 && vertSpace < 0.5 * space && addLineBreak)
-	) &&
-	(!complexMode || (hfont1->isEqualIgnoreBold(*hfont2))) && // in complex mode fonts must be the same, in other modes fonts do not metter
-	str1->dir == str2->dir // text direction the same
-       ) 
-    {
-//      printf("yes\n");
+      // A new paragraph, so keep strings separate
+//    printf("new paragraph\n"); 
+      GBool finish_a = str1->getLink() != NULL;
+      GBool finish_bold   = hfont1->isBold();
+      GBool finish_italic = hfont1->isItalic();
+      CloseTags( str1->htext, finish_a, finish_italic, finish_bold );
+     
+      str1->xMin = curX; str1->yMin = curY; 
+      str1 = str2;
+      curX = str1->xMin; curY = str1->yMin;
+      lineStartX= str1->xMin;
+      hfont1 = hfont2;
+      if( hfont1->isBold() )
+	str1->htext->insert(0,"<b>",3);
+      if( hfont1->isItalic() )
+	str1->htext->insert(0,"<i>",3);
+      if( str1->getLink() != NULL ) {
+	GooString *ls = str1->getLink()->getLinkStart();
+	str1->htext->insert(0, ls);
+	delete ls;
+      }
+    } else {
+//    printf("same paragraph\n");
       n = str1->len + str2->len;
       if ((addSpace = horSpace > 0.1 * space)) {
         ++n;
       }
-      if (addLineBreak) {
+      if (nextLine) {
         ++n;
       }
   
@@ -507,18 +525,21 @@
 					str1->size * sizeof(double));
       if (addSpace) {
 		  str1->text[str1->len] = 0x20;
-		  str1->htext->append(xml?" ":"&nbsp;");
+		  str1->htext->append((xml || reFlow) ? " " : "&nbsp;");
 		  str1->xRight[str1->len] = str2->xMin;
 		  ++str1->len;
       }
-      if (addLineBreak) {
-	  str1->text[str1->len] = '\n';
-	  str1->htext->append("<br>");
-	  str1->xRight[str1->len] = str2->xMin;
-	  ++str1->len;
+      if (nextLine) {
+          if (addNewline) {
+	    str1->text[str1->len] = '\n';
+	    str1->htext->append(reFlow ? "\n" : "<br>");
+	    str1->xRight[str1->len] = str2->xMin;
+	    ++str1->len;
+          }
 	  str1->yMin = str2->yMin;
 	  str1->yMax = str2->yMax;
 	  str1->xMax = str2->xMax;
+	  lineStartX= str2->xMin;
 	  int fontLineSize = hfont1->getLineSize();
 	  int curLineSize = (int)(vertSpace + space); 
 	  if( curLineSize != fontLineSize )
@@ -570,26 +591,6 @@
       }
       str1->yxNext = str2->yxNext;
       delete str2;
-    } else { // keep strings separate
-//      printf("no\n"); 
-      GBool finish_a = str1->getLink() != NULL;
-      GBool finish_bold   = hfont1->isBold();
-      GBool finish_italic = hfont1->isItalic();
-      CloseTags( str1->htext, finish_a, finish_italic, finish_bold );
-     
-      str1->xMin = curX; str1->yMin = curY; 
-      str1 = str2;
-      curX = str1->xMin; curY = str1->yMin;
-      hfont1 = hfont2;
-      if( hfont1->isBold() )
-	str1->htext->insert(0,"<b>",3);
-      if( hfont1->isItalic() )
-	str1->htext->insert(0,"<i>",3);
-      if( str1->getLink() != NULL ) {
-	GooString *ls = str1->getLink()->getLinkStart();
-	str1->htext->insert(0, ls);
-	delete ls;
-      }
     }
   }
   str1->xMin = curX; str1->yMin = curY;
@@ -692,7 +693,7 @@
   
   if( !noframes )
   {  
-      fputs("</HEAD>\n<BODY bgcolor=\"#A0A0A0\" vlink=\"blue\" link=\"blue\">\n",pageFile); 
+      fputs("</HEAD>\n<BODY vlink=\"blue\" link=\"blue\">\n",pageFile); 
   }
   
   if( !ignore ) 
@@ -760,10 +761,10 @@
 		str=new GooString(tmp->htext); 
 		fputs(str->getCString(),f);
 		delete str;      
-		fputs("<br>\n",f);  
+		fputs(reFlow ? "<p>\n" : "<br>\n",f);  
       }
     }
-	fputs("<hr>\n",f);  
+    if (!reFlow) fputs("<hr>\n",f);  
   }
 }
 
@@ -997,7 +998,7 @@
       
       dumpMetaVars(page);
       fprintf(page,"</HEAD>\n");
-      fprintf(page,"<BODY bgcolor=\"#A0A0A0\" vlink=\"blue\" link=\"blue\">\n");
+      fprintf(page,"<BODY vlink=\"blue\" link=\"blue\">\n");
     }
   }
   ok = gTrue; 
@@ -1444,11 +1445,11 @@
 	      GooString *str=GooString::fromInt(page);
 	      /* 		complex 	simple
 	       	frames		file-4.html	files.html#4
-		noframes	file.html#4	file.html#4
+		noframes	#4		#4
 	       */
 	      if (noframes)
 	      {
-		  file->append(".html#");
+		  file= new GooString("#");
 		  file->append(str);
 	      }
 	      else
@@ -1566,7 +1567,7 @@
 		if (noframes)
 		{
 			output = page; 
-			fputs("<hr>\n", output);
+			if (!reFlow) fputs("<hr>\n", output);
 		}
 		else
 		{
@@ -1583,7 +1584,7 @@
  
   	GBool done = newOutlineLevel(output, outlines, catalog);
   	if (done && !complexMode)
-    	fputs("<hr>\n", output);
+    	if (!reFlow) fputs("<hr>\n", output);
 	
 	if (bClose)
 	{
--- pdftohtml.cc	2008/09/30 00:18:37	1.1
+++ pdftohtml.cc	2008/10/01 05:47:12
@@ -54,6 +54,7 @@
 GBool noDrm=gFalse;
 
 GBool showHidden = gFalse;
+GBool reFlow = gFalse;		// Output "reflow" paragraphs
 GBool noMerge = gFalse;
 static char ownerPassword[33] = "";
 static char userPassword[33] = "";
@@ -92,12 +93,14 @@
    "zoom the pdf document (default 1.5)"},
   {"-xml",    argFlag,    &xml,         0,
    "output for XML post-processing"},
+  {"-reflow", argFlag,   &reFlow,   0,
+   "output reflow paragraphs"},
   {"-hidden", argFlag,   &showHidden,   0,
    "output hidden text"},
   {"-nomerge", argFlag, &noMerge, 0,
    "do not merge paragraphs"},   
   {"-enc",    argString,   textEncName,    sizeof(textEncName),
-   "output text encoding name"},
+   "output text encoding name (UTF-8, Latin1 etc)"},
   {"-dev",    argString,   gsDevice,       sizeof(gsDevice),
    "output device name for Ghostscript (png16m, jpeg etc)"},
   {"-v",      argFlag,     &printVersion,  0,
@@ -234,7 +237,7 @@
    { 
        complexMode = gTrue;
        noframes = gTrue;
-       noMerge = gTrue;
+       noMerge = gFalse;
    }
 
   // get page range
--- pdftohtml.1	2008/09/30 00:18:37	1.1
+++ pdftohtml.1	2008/10/01 05:42:30
@@ -52,11 +52,16 @@
 .B \-zoom <fp>
 zoom the pdf document (default 1.5)
 .TP
+.B \-reflow
+join paragraph lines together and separate paragraphs with a <p> tag. With
+this flag off, paragraph lines are separated by <br> tags and paragraphs are
+also separated by <br> tags.
+.TP
 .B \-xml
 output for XML post-processing
 .TP
 .B \-enc <string>
-output text encoding name
+output text encoding name (UTF-8, Latin1 etc)
 .TP
 .B \-opw <string>
 owner password (for encrypted files)
@@ -71,7 +76,7 @@
 output device name for Ghostscript (png16m, jpeg etc)
 .TP
 .B \-nomerge
-do not merge paragraphs
+separate output HTML lines with newline characters
 .TP
 .B \-nodrm
 override document DRM settings


More information about the poppler mailing list