[poppler] pdftohtml patch: new reflow option

Warren Toomey poppler at tuhs.org
Tue Sep 30 03:07:04 PDT 2008


Here is my controversial patch to utils/pdftohtml to alter the format of
output paragraphs. The program now has a command-line option "-reflow"
which enables the new output format.

I've rewritten the logic on lines 491-505 of utils/HtmlOutputDev.cc to be
a lot simpler. I have not yet checked the effect on complex mode or XML
output; I shall do so over the next few days. I have also updated the manual page.

Cheers,
	Warren
-------------- next part --------------
*** pdftohtml.cc	2008/09/30 09:06:27	1.2
--- pdftohtml.cc	2008/09/30 09:13:05
***************
*** 54,59 ****
--- 54,60 ----
  GBool noDrm=gFalse;
  
  GBool showHidden = gFalse;
+ GBool reFlow = gFalse;		// Output "reflow" paragraphs
  GBool noMerge = gFalse;
  static char ownerPassword[33] = "";
  static char userPassword[33] = "";
***************
*** 92,97 ****
--- 93,100 ----
     "zoom the pdf document (default 1.5)"},
    {"-xml",    argFlag,    &xml,         0,
     "output for XML post-processing"},
+   {"-reflow", argFlag,   &reFlow,   0,
+    "output reflow paragraphs"},
    {"-hidden", argFlag,   &showHidden,   0,
     "output hidden text"},
    {"-nomerge", argFlag, &noMerge, 0,
*** pdftohtml.1	2008/09/30 09:06:27	1.2
--- pdftohtml.1	2008/09/30 09:17:37
***************
*** 52,57 ****
--- 52,62 ----
  .B \-zoom <fp>
  zoom the pdf document (default 1.5)
  .TP
+ .B \-reflow
+ join paragraph lines together and separate paragraphs with a <p> tag. With
+ this flag off, paragraph lines are separated by <br> tags and paragraphs are
+ also separated by <br> tags.
+ .TP
  .B \-xml
  output for XML post-processing
  .TP
***************
*** 71,77 ****
  output device name for Ghostscript (png16m, jpeg etc)
  .TP
  .B \-nomerge
! do not merge paragraphs
  .TP
  .B \-nodrm
  override document DRM settings
--- 76,82 ----
  output device name for Ghostscript (png16m, jpeg etc)
  .TP
  .B \-nomerge
! separate output HTML lines with newline characters
  .TP
  .B \-nodrm
  override document DRM settings
*** HtmlOutputDev.cc	2008/09/30 08:51:18	1.2
--- HtmlOutputDev.cc	2008/09/30 09:53:47
***************
*** 48,53 ****
--- 48,54 ----
  extern GBool xml;
  extern GBool showHidden;
  extern GBool noMerge;
+ extern GBool reFlow;
  
  static GooString* basename(GooString* str){
    
***************
*** 376,385 ****
  void HtmlPage::coalesce() {
    HtmlString *str1, *str2;
    HtmlFont *hfont1, *hfont2;
!   double space, horSpace, vertSpace, vertOverlap;
!   GBool addSpace, addLineBreak;
    int n, i;
    double curX, curY;
  
  #if 0 //~ for debugging
    for (str1 = yxStrings; str1; str1 = str1->yxNext) {
--- 377,389 ----
  void HtmlPage::coalesce() {
    HtmlString *str1, *str2;
    HtmlFont *hfont1, *hfont2;
!   double space, horSpace, vertSpace;
!   GBool addSpace;
!   GBool nextLine;			// is str2 on the next line below?
!   GBool addNewline;			// should we output a newline?
    int n, i;
    double curX, curY;
+   double lineStartX=0.0;		// x-value of last line start
  
  #if 0 //~ for debugging
    for (str1 = yxStrings; str1; str1 = str1->yxNext) {
***************
*** 447,497 ****
      str1->htext->insert(0, ls);
      delete ls;
    }
!   curX = str1->xMin; curY = str1->yMin;
  
    while (str1 && (str2 = str1->yxNext)) {
      hfont2 = getFont(str2);
      space = str1->yMax - str1->yMin;
      horSpace = str2->xMin - str1->xMax;
!     addLineBreak = !noMerge && (fabs(str1->xMin - str2->xMin) < 0.4);
      vertSpace = str2->yMin - str1->yMax;
  
  //printf("coalesce %d %d %f? ", str1->dir, str2->dir, d);
  
!     if (str2->yMin >= str1->yMin && str2->yMin <= str1->yMax)
!     {
! 	vertOverlap = str1->yMax - str2->yMin;
!     } else
!     if (str2->yMax >= str1->yMin && str2->yMax <= str1->yMax)
!     {
! 	vertOverlap = str2->yMax - str1->yMin;
!     } else
!     {
!     	vertOverlap = 0;
!     } 
!     
!     if (
! 	(
! 	 (
! 	  (
! 	   (rawOrder && vertOverlap > 0.5 * space) 
! 	   ||
! 	   (!rawOrder && str2->yMin < str1->yMax)
! 	  ) &&
! 	  (horSpace > -0.5 * space && horSpace < space)
! 	 ) ||
!        	 (vertSpace >= 0 && vertSpace < 0.5 * space && addLineBreak)
! 	) &&
! 	(!complexMode || (hfont1->isEqualIgnoreBold(*hfont2))) && // in complex mode fonts must be the same, in other modes fonts do not metter
! 	str1->dir == str2->dir // text direction the same
!        ) 
      {
! //      printf("yes\n");
        n = str1->len + str2->len;
        if ((addSpace = horSpace > 0.1 * space)) {
          ++n;
        }
!       if (addLineBreak) {
          ++n;
        }
    
--- 451,515 ----
      str1->htext->insert(0, ls);
      delete ls;
    }
!   lineStartX= curX = str1->xMin; curY = str1->yMin;
  
    while (str1 && (str2 = str1->yxNext)) {
      hfont2 = getFont(str2);
      space = str1->yMax - str1->yMin;
      horSpace = str2->xMin - str1->xMax;
!     // Determine if str2 is on the line below the current line
!     addNewline=nextLine = !noMerge && (fabs(str2->yMin - str1->yMin) > 5.0);
      vertSpace = str2->yMin - str1->yMax;
  
+ #if 1
+     // Heuristic: if the last character in str1 is a hyphen,
+     // turn off addNewline. This will "glue" hyphenated words
+     // that have been split over multiple lines.
+     if (reFlow && str1->text[str1->len -1] == '-') {
+        addNewline=0;
+        // Also remove the hyphen
+        str1->len--;
+        str1->htext->del(str1->htext->getLength() - 1, 1);
+     }
+ #endif
+ 
  //printf("coalesce %d %d %f? ", str1->dir, str2->dir, d);
  
!     // Is str2 a new paragraph?
!     if (nextLine && (
!             // Is this an indented new line?
!             (str2->xMin > lineStartX + 3.0)
!             // Or is there a blank line between this and the last line?
!          || (vertSpace > 0.5 * space) ))
      {
!       // A new paragraph, so keep strings separate
! //    printf("new paragraph\n"); 
!       GBool finish_a = str1->getLink() != NULL;
!       GBool finish_bold   = hfont1->isBold();
!       GBool finish_italic = hfont1->isItalic();
!       CloseTags( str1->htext, finish_a, finish_italic, finish_bold );
!      
!       str1->xMin = curX; str1->yMin = curY; 
!       str1 = str2;
!       curX = str1->xMin; curY = str1->yMin;
!       lineStartX= str1->xMin;
!       hfont1 = hfont2;
!       if( hfont1->isBold() )
! 	str1->htext->insert(0,"<b>",3);
!       if( hfont1->isItalic() )
! 	str1->htext->insert(0,"<i>",3);
!       if( str1->getLink() != NULL ) {
! 	GooString *ls = str1->getLink()->getLinkStart();
! 	str1->htext->insert(0, ls);
! 	delete ls;
!       }
!     } else {
! //    printf("same paragraph\n");
        n = str1->len + str2->len;
        if ((addSpace = horSpace > 0.1 * space)) {
          ++n;
        }
!       if (nextLine) {
          ++n;
        }
    
***************
*** 502,519 ****
  					str1->size * sizeof(double));
        if (addSpace) {
  		  str1->text[str1->len] = 0x20;
! 		  str1->htext->append(xml?" ":"&nbsp;");
  		  str1->xRight[str1->len] = str2->xMin;
  		  ++str1->len;
        }
!       if (addLineBreak) {
! 	  str1->text[str1->len] = '\n';
! 	  str1->htext->append("<br>");
! 	  str1->xRight[str1->len] = str2->xMin;
! 	  ++str1->len;
  	  str1->yMin = str2->yMin;
  	  str1->yMax = str2->yMax;
  	  str1->xMax = str2->xMax;
  	  int fontLineSize = hfont1->getLineSize();
  	  int curLineSize = (int)(vertSpace + space); 
  	  if( curLineSize != fontLineSize )
--- 520,540 ----
  					str1->size * sizeof(double));
        if (addSpace) {
  		  str1->text[str1->len] = 0x20;
! 		  str1->htext->append((xml || reFlow) ? " " : "&nbsp;");
  		  str1->xRight[str1->len] = str2->xMin;
  		  ++str1->len;
        }
!       if (nextLine) {
!           if (addNewline) {
! 	    str1->text[str1->len] = '\n';
! 	    str1->htext->append(reFlow ? "\n" : "<br>");
! 	    str1->xRight[str1->len] = str2->xMin;
! 	    ++str1->len;
!           }
  	  str1->yMin = str2->yMin;
  	  str1->yMax = str2->yMax;
  	  str1->xMax = str2->xMax;
+ 	  lineStartX= str2->xMin;
  	  int fontLineSize = hfont1->getLineSize();
  	  int curLineSize = (int)(vertSpace + space); 
  	  if( curLineSize != fontLineSize )
***************
*** 565,590 ****
        }
        str1->yxNext = str2->yxNext;
        delete str2;
-     } else { // keep strings separate
- //      printf("no\n"); 
-       GBool finish_a = str1->getLink() != NULL;
-       GBool finish_bold   = hfont1->isBold();
-       GBool finish_italic = hfont1->isItalic();
-       CloseTags( str1->htext, finish_a, finish_italic, finish_bold );
-      
-       str1->xMin = curX; str1->yMin = curY; 
-       str1 = str2;
-       curX = str1->xMin; curY = str1->yMin;
-       hfont1 = hfont2;
-       if( hfont1->isBold() )
- 	str1->htext->insert(0,"<b>",3);
-       if( hfont1->isItalic() )
- 	str1->htext->insert(0,"<i>",3);
-       if( str1->getLink() != NULL ) {
- 	GooString *ls = str1->getLink()->getLinkStart();
- 	str1->htext->insert(0, ls);
- 	delete ls;
-       }
      }
    }
    str1->xMin = curX; str1->yMin = curY;
--- 586,591 ----
***************
*** 687,693 ****
    
    if( !noframes )
    {  
!       fputs("</HEAD>\n<BODY bgcolor=\"#A0A0A0\" vlink=\"blue\" link=\"blue\">\n",pageFile); 
    }
    
    if( !ignore ) 
--- 688,694 ----
    
    if( !noframes )
    {  
!       fputs("</HEAD>\n<BODY vlink=\"blue\" link=\"blue\">\n",pageFile); 
    }
    
    if( !ignore ) 
***************
*** 750,764 ****
      delete fName;
  
      GooString* str;
!     for(HtmlString *tmp=yxStrings;tmp;tmp=tmp->yxNext){
        if (tmp->htext){
  		str=new GooString(tmp->htext); 
  		fputs(str->getCString(),f);
  		delete str;      
! 		fputs("<br>\n",f);  
        }
      }
! 	fputs("<hr>\n",f);  
    }
  }
  
--- 751,765 ----
      delete fName;
  
      GooString* str;
!     for(HtmlString *tmp=yxStrings;tmp;tmp=tmp->yxNext) {
        if (tmp->htext){
  		str=new GooString(tmp->htext); 
  		fputs(str->getCString(),f);
  		delete str;      
! 		fputs(reFlow ? "<p>\n" : "<br>\n",f);  
        }
      }
!     if (!reFlow) fputs("<hr>\n",f);  
    }
  }
  
***************
*** 992,998 ****
        
        dumpMetaVars(page);
        fprintf(page,"</HEAD>\n");
!       fprintf(page,"<BODY bgcolor=\"#A0A0A0\" vlink=\"blue\" link=\"blue\">\n");
      }
    }
    ok = gTrue; 
--- 993,999 ----
        
        dumpMetaVars(page);
        fprintf(page,"</HEAD>\n");
!       fprintf(page,"<BODY vlink=\"blue\" link=\"blue\">\n");
      }
    }
    ok = gTrue; 
***************
*** 1498,1504 ****
  		if (noframes)
  		{
  			output = page; 
! 			fputs("<hr>\n", output);
  		}
  		else
  		{
--- 1499,1505 ----
  		if (noframes)
  		{
  			output = page; 
! 			if (!reFlow) fputs("<hr>\n", output);
  		}
  		else
  		{
***************
*** 1515,1521 ****
   
    	GBool done = newOutlineLevel(output, outlines, catalog);
    	if (done && !complexMode)
!     	fputs("<hr>\n", output);
  	
  	if (bClose)
  	{
--- 1516,1522 ----
   
    	GBool done = newOutlineLevel(output, outlines, catalog);
    	if (done && !complexMode)
!     	if (!reFlow) fputs("<hr>\n", output);
  	
  	if (bClose)
  	{


More information about the poppler mailing list