[poppler] utils/HtmlFonts.cc utils/HtmlFonts.h utils/HtmlOutputDev.cc utils/HtmlOutputDev.h utils/HtmlUtils.h

Albert Astals Cid aacid at kemper.freedesktop.org
Thu Aug 18 09:49:26 PDT 2011


 utils/HtmlFonts.cc     |   64 ++++++++++--------
 utils/HtmlFonts.h      |   10 ++
 utils/HtmlOutputDev.cc |  170 ++++++++++++++++++++++++++++++++++---------------
 utils/HtmlOutputDev.h  |    8 +-
 utils/HtmlUtils.h      |   51 ++++++++++++++
 5 files changed, 218 insertions(+), 85 deletions(-)

New commits:
commit 86271e4810f714d4ba7a2a6651a9b1d04f653262
Author: Joshua Richardson <jric at chegg.com>
Date:   Thu Aug 18 18:48:40 2011 +0200

    pdftohtml: Support text rotation
    
    Includes a few other fixlets.
    See bug 38586 for more info

diff --git a/utils/HtmlFonts.cc b/utils/HtmlFonts.cc
index 2ae9222..c9b558e 100644
--- a/utils/HtmlFonts.cc
+++ b/utils/HtmlFonts.cc
@@ -21,7 +21,8 @@
 // Copyright (C) 2008 Boris Toloknov <tlknv at yandex.ru>
 // Copyright (C) 2008 Tomas Are Haavet <tomasare at gmail.com>
 // Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac at cdacmumbai.in) and Onkar Potdar (onkar at cdacmumbai.in)
-// Copyright (C) 2011 Joshua Richardson <joshuarbox-junk1 at yahoo.com>
+// Copyright (C) 2011 Joshua Richardson <jric at chegg.com>
+// Copyright (C) 2011 Stephen Reichling <sreichling at chegg.com>
 //
 // To see a description of the changes please see the Changelog file that
 // came with your tarball or type make ChangeLog if you are building from git
@@ -29,6 +30,7 @@
 //========================================================================
 
 #include "HtmlFonts.h"
+#include "HtmlUtils.h"
 #include "GlobalParams.h"
 #include "UnicodeMap.h"
 #include <stdio.h>
@@ -120,6 +122,7 @@ HtmlFont::HtmlFont(GooString* ftname,int _size, GfxRGB rgb){
   size=(_size-1);
   italic = gFalse;
   bold = gFalse;
+  rotOrSkewed = gFalse;
 
   if (fontname){
     if (strstr(fontname->lowerCase()->getCString(),"bold"))  bold=gTrue;
@@ -148,6 +151,8 @@ HtmlFont::HtmlFont(const HtmlFont& x){
    pos=x.pos;
    color=x.color;
    if (x.FontName) FontName=new GooString(x.FontName);
+   rotOrSkewed = x.rotOrSkewed;
+   memcpy(rotSkewMat, x.rotSkewMat, sizeof(rotSkewMat));
  }
 
 
@@ -176,14 +181,15 @@ void HtmlFont::clear(){
 
 
 /*
-  This function is used to compare font uniquily for insertion into
+  This function is used to compare font uniquely for insertion into
   the list of all encountered fonts
 */
 GBool HtmlFont::isEqual(const HtmlFont& x) const{
-  return ((size==x.size) &&
+  return (size==x.size) &&
 	  (lineSize==x.lineSize) &&
 	  (pos==x.pos) && (bold==x.bold) && (italic==x.italic) &&
-	  (color.isEqual(x.getColor())));
+	  (color.isEqual(x.getColor())) && isRotOrSkewed() == x.isRotOrSkewed() &&
+	  (!isRotOrSkewed() || rot_matrices_equal(getRotMat(), x.getRotMat()));
 }
 
 /*
@@ -232,11 +238,11 @@ GooString* HtmlFont::HtmlFilter(Unicode* u, int uLen) {
   for (int i = 0; i < uLen; ++i) {
     switch (u[i])
       { 
-	case '"': tmp->append("&quot;");  break;
+	case '"': tmp->append("&#34;");  break;
 	case '&': tmp->append("&amp;");  break;
 	case '<': tmp->append("&lt;");  break;
 	case '>': tmp->append("&gt;");  break;
-	case ' ': tmp->append( !xml && ( i+1 >= uLen || !tmp->getLength() || tmp->getChar( tmp->getLength()-1 ) == ' ' ) ? "&nbsp;" : " " );
+	case ' ': tmp->append( !xml && ( i+1 >= uLen || !tmp->getLength() || tmp->getChar( tmp->getLength()-1 ) == ' ' ) ? "&#160;" : " " );
 	          break;
 	default:  
 	  {
@@ -289,29 +295,6 @@ int HtmlFontAccu::AddFont(const HtmlFont& font){
  return (accu->size()-1);
 }
 
-// get CSS font name for font #i 
-GooString* HtmlFontAccu::getCSStyle(int i, GooString* content, int j){
-  GooString *tmp;
-  GooString *iStr=GooString::fromInt(i);
-  GooString *jStr=GooString::fromInt(j);
-  
-  if (!xml) {
-    tmp = new GooString("<span class=\"ft");
-    tmp->append(jStr);
-    tmp->append(iStr);
-    tmp->append("\">");
-    tmp->append(content);
-    tmp->append("</span>");
-  } else {
-    tmp = new GooString("");
-    tmp->append(content);
-  }
-
-  delete jStr;
-  delete iStr;
-  return tmp;
-}
-
 // get CSS font definition for font #i 
 GooString* HtmlFontAccu::CSStyle(int i, int j){
    GooString *tmp=new GooString();
@@ -343,6 +326,29 @@ GooString* HtmlFontAccu::CSStyle(int i, int j){
      tmp->append(fontName); //font.getFontName());
      tmp->append(";color:");
      tmp->append(colorStr);
+     // if there is rotation or skew, include the matrix
+     if (font.isRotOrSkewed()) {
+    	 const double * const text_mat = font.getRotMat();
+    	 GooString matrix_str(" matrix(");
+    	 matrix_str.appendf("{0:10.10g}, {1:10.10g}, {2:10.10g}, {3:10.10g}, 0, 0)",
+    			 text_mat[0], text_mat[1], text_mat[2], text_mat[3]);
+    	 tmp->append(";-moz-transform:");
+    	 tmp->append(&matrix_str);
+    	 tmp->append(";-webkit-transform:");
+    	 tmp->append(&matrix_str);
+    	 tmp->append(";-o-transform:");
+    	 tmp->append(&matrix_str);
+    	 tmp->append(";-ms-transform:");
+    	 tmp->append(&matrix_str);
+    	 // Todo: 75% is a wild guess that seems to work pretty well;
+    	 // We probably need to calculate the real percentage
+    	 // Based on the characteristic baseline and bounding box of current font
+    	 // PDF origin is at baseline
+    	 tmp->append(";-moz-transform-origin: left 75%");
+    	 tmp->append(";-webkit-transform-origin: left 75%");
+    	 tmp->append(";-o-transform-origin: left 75%");
+    	 tmp->append(";-ms-transform-origin: left 75%");
+     }
      tmp->append(";}");
    }
    if (xml) {
diff --git a/utils/HtmlFonts.h b/utils/HtmlFonts.h
index 2cdea4b..3e3b028 100644
--- a/utils/HtmlFonts.h
+++ b/utils/HtmlFonts.h
@@ -20,6 +20,7 @@
 // Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac at cdacmumbai.in) and Onkar Potdar (onkar at cdacmumbai.in)
 // Copyright (C) 2010 Albert Astals Cid <aacid at kde.org>
 // Copyright (C) 2011 Steven Murdoch <Steven.Murdoch at cl.cam.ac.uk>
+// Copyright (C) 2011 Joshua Richardson <jric at chegg.com>
 //
 // To see a description of the changes please see the Changelog file that
 // came with your tarball or type make ChangeLog if you are building from git
@@ -62,13 +63,15 @@ class HtmlFont{
    int lineSize;
    GBool italic;
    GBool bold;
+   GBool rotOrSkewed;
    int pos; // position of the font name in the fonts array
    static GooString *DefaultFont;
    GooString *FontName;
    HtmlFontColor color;
+   double rotSkewMat[4]; // only four values needed for rotation and skew
 public:  
 
-   HtmlFont(){FontName=NULL;};
+   HtmlFont(){FontName=NULL; rotOrSkewed = gFalse;}
    HtmlFont(GooString* fontname,int _size, GfxRGB rgb);
    HtmlFont(const HtmlFont& x);
    HtmlFont& operator=(const HtmlFont& x);
@@ -78,9 +81,13 @@ public:
    GooString* getFullName();
    GBool isItalic() const {return italic;}
    GBool isBold() const {return bold;}
+   GBool isRotOrSkewed() const { return rotOrSkewed; }
    unsigned int getSize() const {return size;}
    int getLineSize() const {return lineSize;}
    void setLineSize(int _lineSize) { lineSize = _lineSize; }
+   void setRotMat(const double * const mat) // copies sizeof(rotSkewMat) bytes (four doubles) from mat
+   { rotOrSkewed = gTrue; memcpy(rotSkewMat, mat, sizeof(rotSkewMat)); } // and flags the font as rotated/skewed
+   const double *getRotMat() const { return rotSkewMat; }
    GooString* getFontName();
    static GooString* getDefaultFont();
    static void setDefaultFont(GooString* defaultFont);
@@ -102,7 +109,6 @@ public:
   HtmlFont *Get(int i){
     return &(*accu)[i];
   } 
-  GooString* getCSStyle (int i,GooString* content, int j = 0);
   GooString* CSStyle(int i, int j = 0);
   int size() const {return accu->size();}
   
diff --git a/utils/HtmlOutputDev.cc b/utils/HtmlOutputDev.cc
index a71bb96..615cf5f 100644
--- a/utils/HtmlOutputDev.cc
+++ b/utils/HtmlOutputDev.cc
@@ -28,7 +28,8 @@
 // Copyright (C) 2010 Adrian Johnson <ajohnson at redneon.com>
 // Copyright (C) 2010 Hib Eris <hib at hiberis.nl>
 // Copyright (C) 2010 OSSD CDAC Mumbai by Leena Chourey (leenac at cdacmumbai.in) and Onkar Potdar (onkar at cdacmumbai.in)
-// Copyright (C) 2011 Joshua Richardson <joshuarbox-junk1 at yahoo.com>
+// Copyright (C) 2011 Joshua Richardson <jric at chegg.com>
+// Copyright (C) 2011 Stephen Reichling <sreichling at chegg.com>
 //
 // To see a description of the changes please see the Changelog file that
 // came with your tarball or type make ChangeLog if you are building from git
@@ -46,6 +47,7 @@
 #include <stddef.h>
 #include <ctype.h>
 #include <math.h>
+#include <iostream>
 #include "goo/GooString.h"
 #include "goo/GooList.h"
 #include "UnicodeMap.h"
@@ -61,6 +63,9 @@
 #include "GlobalParams.h"
 #include "HtmlOutputDev.h"
 #include "HtmlFonts.h"
+#include "HtmlUtils.h"
+
+#define DEBUG __FILE__ << ": " << __LINE__ << ": DEBUG: "
 
 // returns true if x is closer to y than x is to z
 static inline bool IS_CLOSER(float x, float y, float z) { return fabs((x)-(y)) < fabs((x)-(z)); }
@@ -80,6 +85,9 @@ extern GBool xml;
 extern GBool showHidden;
 extern GBool noMerge;
 
+static GBool debug = gFalse;
+static GooString *gstr_buff0 = NULL; // a workspace in which I format strings
+
 static GooString* basename(GooString* str){
   
   char *p=str->getCString();
@@ -102,11 +110,37 @@ static GooString* Dirname(GooString* str){
 } 
 #endif
 
+static const char *print_matrix(const double *mat) { // debug helper
+  GooString *const formatted = GooString::format("[{0:g} {1:g} {2:g} {3:g} {4:g} {5:g}]",
+                                  mat[0], mat[1], mat[2], mat[3], mat[4], mat[5]);
+  delete gstr_buff0; // release the previously formatted debug string
+  gstr_buff0 = formatted; // shared workspace keeps the text alive for the caller
+  return gstr_buff0->getCString();
+}
+
+static const char *print_uni_str(const Unicode *u, const unsigned uLen) {
+  // Debug helper: renders u as ASCII in the shared gstr_buff0 workspace,
+  // substituting '?' for every code point outside the 7-bit range.
+  delete gstr_buff0;
+  gstr_buff0 = NULL; // never leave a dangling pointer for the next delete
+
+  if (!uLen) return "";
+  gstr_buff0 = new GooString();
+  for (unsigned i = 0; i < uLen; i++) {
+    // append() grows the string in place, so its result must not be treated
+    // as a fresh allocation that replaces (and frees) the workspace.
+    gstr_buff0->append(u[i] < 0x7F ? static_cast<char>(u[i]) : '?');
+  }
+
+  // The returned pointer is valid until the next print_* helper call.
+  return gstr_buff0->getCString();
+}
+
 //------------------------------------------------------------------------
 // HtmlString
 //------------------------------------------------------------------------
 
-HtmlString::HtmlString(GfxState *state, double fontSize, HtmlFontAccu* fonts) {
+HtmlString::HtmlString(GfxState *state, double fontSize, HtmlFontAccu* _fonts) : fonts(_fonts) {
   GfxFont *font;
   double x, y;
 
@@ -129,6 +163,22 @@ HtmlString::HtmlString(GfxState *state, double fontSize, HtmlFontAccu* fonts) {
     GooString *name = state->getFont()->getName();
     if (!name) name = HtmlFont::getDefaultFont(); //new GooString("default");
     HtmlFont hfont=HtmlFont(name, static_cast<int>(fontSize-1), rgb);
+    if (isMatRotOrSkew(state->getTextMat())) {
+      double normalizedMatrix[4];
+      memcpy(normalizedMatrix, state->getTextMat(), sizeof(normalizedMatrix));
+      // browser rotates the opposite way
+      // so flip the sign of the angle -> sin() components change sign
+      if (debug)
+        std::cerr << DEBUG << "before transform: " << print_matrix(normalizedMatrix) << std::endl;
+      normalizedMatrix[1] *= -1;
+      normalizedMatrix[2] *= -1;
+      if (debug)
+        std::cerr << DEBUG << "after reflecting angle: " << print_matrix(normalizedMatrix) << std::endl;
+      normalizeRotMat(normalizedMatrix);
+      if (debug)
+        std::cerr << DEBUG << "after norm: " << print_matrix(normalizedMatrix) << std::endl;
+      hfont.setRotMat(normalizedMatrix);
+    }
     fontpos = fonts->AddFont(hfont);
   } else {
     // this means that the PDF file draws text without a current font,
@@ -301,9 +351,27 @@ void HtmlPage::addChar(GfxState *state, double x, double y,
   // and is not too far away from it before adding 
   //if ((UnicodeMap::getDirection(u[0]) != curStr->dir) || 
   // XXX
-  if (
-     (n > 0 && 
-      fabs(x1 - curStr->xRight[n-1]) > 0.1 * (curStr->yMax - curStr->yMin))) {
+  if (debug) {
+    double *text_mat = state->getTextMat();
+    // rotation is (cos q, sin q, -sin q, cos q, 0, 0)
+    // sin q is zero iff there is no rotation, or 180 deg. rotation;
+    // for 180 rotation, cos q will be negative
+    if (text_mat[0] < 0 || !is_within(text_mat[1], .1, 0)) {
+      std::cerr << DEBUG << "rotation matrix for \"" << print_uni_str(u, uLen) << '"' << std::endl;
+      std::cerr << "text " << print_matrix(state->getTextMat());
+    }
+  }
+  if (n > 0 && // don't start a new string, unless there is already a string
+      // TODO: the following line assumes that text is flowing left to
+      // right, which will not necessarily be the case, e.g. if rotated;
+      // It assesses whether or not two characters are close enough to
+      // be part of the same string
+      fabs(x1 - curStr->xRight[n-1]) > 0.1 * (curStr->yMax - curStr->yMin) &&
+      // rotation is (cos q, sin q, -sin q, cos q, 0, 0)
+      // sin q is zero iff there is no rotation, or 180 deg. rotation;
+      // for 180 rotation, cos q will be negative
+      !rot_matrices_equal(curStr->getFont().getRotMat(), state->getTextMat()))
+  {
     endString();
     beginString(state, NULL);
   }
@@ -546,13 +614,13 @@ void HtmlPage::coalesce() {
 					str1->size * sizeof(double));
       if (addSpace) {
 		  str1->text[str1->len] = 0x20;
-		  str1->htext->append(xml?" ":"&nbsp;");
+		  str1->htext->append(xml?" ":"&#160;");
 		  str1->xRight[str1->len] = str2->xMin;
 		  ++str1->len;
       }
       if (addLineBreak) {
 	  str1->text[str1->len] = '\n';
-	  str1->htext->append("<br>");
+	  str1->htext->append("<br/>");
 	  str1->xRight[str1->len] = str2->xMin;
 	  ++str1->len;
 	  str1->yMin = str2->yMin;
@@ -660,31 +728,22 @@ void HtmlPage::dumpAsXML(FILE* f,int page){
     delete fontCSStyle;
   }
   
-  GooString *str, *str1 = NULL;
   for(HtmlString *tmp=yxStrings;tmp;tmp=tmp->yxNext){
     if (tmp->htext){
-      str=new GooString(tmp->htext);
       fprintf(f,"<text top=\"%d\" left=\"%d\" ",xoutRound(tmp->yMin),xoutRound(tmp->xMin));
       fprintf(f,"width=\"%d\" height=\"%d\" ",xoutRound(tmp->xMax-tmp->xMin),xoutRound(tmp->yMax-tmp->yMin));
       fprintf(f,"font=\"%d\">", tmp->fontpos);
-      str1=fonts->getCSStyle(tmp->fontpos, str);
-      fputs(str1->getCString(),f);
-      delete str;
-      delete str1;
+      fputs(tmp->htext->getCString(),f);
       fputs("</text>\n",f);
     }
   }
   fputs("</page>\n",f);
 }
 
-
-void HtmlPage::dumpComplex(FILE *file, int page){
-  FILE* pageFile;
+int HtmlPage::dumpComplexHeaders(FILE * const file, FILE *& pageFile, int page) {
   GooString* tmp;
   char* htmlEncoding;
 
-  if( firstPage == -1 ) firstPage = page; 
-  
   if( !noframes )
   {
       GooString* pgNum=GooString::fromInt(page);
@@ -700,22 +759,21 @@ void HtmlPage::dumpComplex(FILE *file, int page){
       if (!pageFile) {
 	  error(-1, "Couldn't open html file '%s'", tmp->getCString());
 	  delete tmp;
-	  return;
+	  return 1;
       } 
 
       if (!singleHtml)
-          fprintf(pageFile,"%s\n<HTML>\n<HEAD>\n<TITLE>Page %d</TITLE>\n\n", DOCTYPE, page);
+        fprintf(pageFile,"%s\n<HTML xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"\" xml:lang=\"\">\n<HEAD>\n<TITLE>Page %d</TITLE>\n\n", DOCTYPE, page);
       else
-          fprintf(pageFile,"%s\n<HTML>\n<HEAD>\n<TITLE>%s</TITLE>\n\n", DOCTYPE, tmp->getCString());
+        fprintf(pageFile,"%s\n<HTML xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"\" xml:lang=\"\">\n<HEAD>\n<TITLE>%s</TITLE>\n\n", DOCTYPE, tmp->getCString());
 
       delete tmp;
 
-      htmlEncoding = HtmlOutputDev::mapEncodingToHtml
-	  (globalParams->getTextEncodingName());
+      htmlEncoding = HtmlOutputDev::mapEncodingToHtml(globalParams->getTextEncodingName());
       if (!singleHtml)
-          fprintf(pageFile, "<META http-equiv=\"Content-Type\" content=\"text/html; charset=%s\">\n", htmlEncoding);
+        fprintf(pageFile, "<META http-equiv=\"Content-Type\" content=\"text/html; charset=%s\"/>\n", htmlEncoding);
       else
-          fprintf(pageFile, "<META http-equiv=\"Content-Type\" content=\"text/html; charset=%s\">\n <br>\n", htmlEncoding);
+        fprintf(pageFile, "<META http-equiv=\"Content-Type\" content=\"text/html; charset=%s\"/>\n <br/>\n", htmlEncoding);
   }
   else 
   {
@@ -724,12 +782,21 @@ void HtmlPage::dumpComplex(FILE *file, int page){
       fprintf(pageFile,"<a name=\"%d\"></a>\n", page);
   } 
   
-  fprintf(pageFile,"<DIV style=\"position:relative;width:%d;height:%d;\">\n",
-	pageWidth, pageHeight);
+  return 0;
+}
+
+void HtmlPage::dumpComplex(FILE *file, int page){
+  FILE* pageFile;
+  GooString* tmp;
+
+  if( firstPage == -1 ) firstPage = page; 
+  
+  if (dumpComplexHeaders(file, pageFile, page)) { error(-1, "Couldn't write headers."); return; }
 
   tmp=basename(DocName);
    
   fputs("<STYLE type=\"text/css\">\n<!--\n",pageFile);
+  fputs("\tp {margin: 0; padding: 0;}",pageFile);
   for(int i=fontsPageMarker;i!=fonts->size();i++) {
     GooString *fontCSStyle;
     if (!singleHtml)
@@ -747,33 +814,33 @@ void HtmlPage::dumpComplex(FILE *file, int page){
       fputs("</HEAD>\n<BODY bgcolor=\"#A0A0A0\" vlink=\"blue\" link=\"blue\">\n",pageFile); 
   }
   
+  fprintf(pageFile,"<DIV id=\"page%d-div\" style=\"position:relative;width:%dpx;height:%dpx;\">\n",
+      page, pageWidth, pageHeight);
+
   if( !ignore ) 
   {
     fprintf(pageFile,
-	    "<IMG width=\"%d\" height=\"%d\" src=\"%s%03d.%s\" alt=\"background image\">\n",
+	    "<IMG width=\"%d\" height=\"%d\" src=\"%s%03d.%s\" alt=\"background image\"/>\n",
 	    pageWidth, pageHeight, tmp->getCString(), 
 		(page-firstPage+1), imgExt->getCString());
   }
   
   delete tmp;
   
-  GooString *str, *str1 = NULL;
   for(HtmlString *tmp1=yxStrings;tmp1;tmp1=tmp1->yxNext){
     if (tmp1->htext){
-      str=new GooString(tmp1->htext);
       fprintf(pageFile,
-	      "<DIV style=\"position:absolute;top:%d;left:%d\">",
+	      "<P style=\"position:absolute;top:%dpx;left:%dpx;white-space:nowrap\" class=\"ft",
 	      xoutRound(tmp1->yMin),
 	      xoutRound(tmp1->xMin));
-      fputs("<nobr>",pageFile); 
-      if (!singleHtml)
-          str1=fonts->getCSStyle(tmp1->fontpos, str);
-      else
-          str1=fonts->getCSStyle(tmp1->fontpos, str, page);
-      fputs(str1->getCString(),pageFile);
-      delete str;      
-      delete str1;
-      fputs("</nobr></DIV>\n",pageFile);
+      if (!singleHtml) {
+          fputc('0', pageFile);
+      } else {
+          fprintf(pageFile, "%d", page);
+      }
+      fprintf(pageFile,"%d\">", tmp1->fontpos);
+      fputs(tmp1->htext->getCString(), pageFile);
+      fputs("</P>\n", pageFile);
     }
   }
 
@@ -801,7 +868,7 @@ void HtmlPage::dump(FILE *f, int pageNum)
     int listlen=HtmlOutputDev::imgList->getLength();
     for (int i = 0; i < listlen; i++) {
       GooString *fName= (GooString *)HtmlOutputDev::imgList->del(0);
-      fprintf(f,"<IMG src=\"%s\"><br>\n",fName->getCString());
+      fprintf(f,"<IMG src=\"%s\"/><br/>\n",fName->getCString());
       delete fName;
     }
     HtmlOutputDev::imgNum=1;
@@ -812,7 +879,7 @@ void HtmlPage::dump(FILE *f, int pageNum)
 		str=new GooString(tmp->htext); 
 		fputs(str->getCString(),f);
 		delete str;      
-		fputs("<br>\n",f);  
+		fputs("<br/>\n",f);
       }
     }
 	fputs("<hr>\n",f);  
@@ -879,7 +946,7 @@ GooString* HtmlMetaVar::toString()
     result->append(name);
     result->append("\" content=\"");
     result->append(content);
-    result->append("\">"); 
+    result->append("\"/>");
     return result;
 }
 
@@ -920,12 +987,12 @@ void HtmlOutputDev::doFrame(int firstPage){
   delete fName;
     
   fName=basename(Docname);
-  fputs(DOCTYPE_FRAMES, fContentsFrame);
+  fputs(DOCTYPE, fContentsFrame);
   fputs("\n<HTML>",fContentsFrame);
   fputs("\n<HEAD>",fContentsFrame);
   fprintf(fContentsFrame,"\n<TITLE>%s</TITLE>",docTitle->getCString());
   htmlEncoding = mapEncodingToHtml(globalParams->getTextEncodingName());
-  fprintf(fContentsFrame, "\n<META http-equiv=\"Content-Type\" content=\"text/html; charset=%s\">\n", htmlEncoding);
+  fprintf(fContentsFrame, "\n<META http-equiv=\"Content-Type\" content=\"text/html; charset=%s\"/>\n", htmlEncoding);
   dumpMetaVars(fContentsFrame);
   fprintf(fContentsFrame, "</HEAD>\n");
   fputs("<FRAMESET cols=\"100,*\">\n",fContentsFrame);
@@ -995,12 +1062,12 @@ HtmlOutputDev::HtmlOutputDev(char *fileName, char *title,
          }
          delete left;
          fputs(DOCTYPE, fContentsFrame);
-         fputs("<HTML>\n<HEAD>\n<TITLE></TITLE>\n</HEAD>\n<BODY>\n",fContentsFrame);
+         fputs("<HTML xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"\" xml:lang=\"\">\n<HEAD>\n<TITLE></TITLE>\n</HEAD>\n<BODY>\n", fContentsFrame);
 
          if (doOutline)
          {
              GooString *str = basename(Docname);
-             fprintf(fContentsFrame, "<A href=\"%s%s\" target=\"contents\">Outline</a><br>", str->getCString(), complexMode ? "-outline.html" : "s.html#outline");
+             fprintf(fContentsFrame, "<A href=\"%s%s\" target=\"contents\">Outline</a><br/>", str->getCString(), complexMode ? "-outline.html" : "s.html#outline");
              delete str;
          }
      }
@@ -1044,10 +1111,9 @@ HtmlOutputDev::HtmlOutputDev(char *fileName, char *title,
     } 
     else 
     {
-      fprintf(page,"%s\n<HTML>\n<HEAD>\n<TITLE>%s</TITLE>\n",
-	      DOCTYPE, docTitle->getCString());
+      fprintf(page,"%s\n<HTML xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"\" xml:lang=\"\">\n<HEAD>\n<TITLE>%s</TITLE>\n", DOCTYPE, docTitle->getCString());
       
-      fprintf(page, "<META http-equiv=\"Content-Type\" content=\"text/html; charset=%s\">\n", htmlEncoding);
+      fprintf(page, "<META http-equiv=\"Content-Type\" content=\"text/html; charset=%s\"/>\n", htmlEncoding);
       
       dumpMetaVars(page);
       fprintf(page,"</HEAD>\n");
@@ -1113,7 +1179,7 @@ void HtmlOutputDev::startPage(int pageNum, GfxState *state) {
 		fprintf(fContentsFrame,"<A href=\"%s-%d.html\"",str->getCString(),pageNum);
       else 
 		fprintf(fContentsFrame,"<A href=\"%ss.html#%d\"",str->getCString(),pageNum);
-      fprintf(fContentsFrame," target=\"contents\" >Page %d</a><br>\n",pageNum);
+      fprintf(fContentsFrame," target=\"contents\" >Page %d</a><br/>\n",pageNum);
     }
   }
 
@@ -1520,7 +1586,7 @@ GBool HtmlOutputDev::dumpDocOutline(Catalog* catalog)
 				return gFalse;
 			delete str;
 			bClose = gTrue;
-     		fputs("<HTML>\n<HEAD>\n<TITLE>Document Outline</TITLE>\n</HEAD>\n<BODY>\n", output);
+     		fputs("<HTML xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"\" xml:lang=\"\">\n<HEAD>\n<TITLE>Document Outline</TITLE>\n</HEAD>\n<BODY>\n", output);
 		}
 	}
  
diff --git a/utils/HtmlOutputDev.h b/utils/HtmlOutputDev.h
index c268ce7..474e3af 100644
--- a/utils/HtmlOutputDev.h
+++ b/utils/HtmlOutputDev.h
@@ -19,6 +19,8 @@
 // Copyright (C) 2009, 2011 Carlos Garcia Campos <carlosgc at gnome.org>
 // Copyright (C) 2009 Kovid Goyal <kovid at kovidgoyal.net>
 // Copyright (C) 2010 Hib Eris <hib at hiberis.nl>
+// Copyright (C) 2011 Joshua Richardson <jric at chegg.com>
+// Copyright (C) 2011 Stephen Reichling <sreichling at chegg.com>
 //
 // To see a description of the changes please see the Changelog file that
 // came with your tarball or type make ChangeLog if you are building from git
@@ -52,8 +54,7 @@
 
 #define xoutRound(x) ((int)(x + 0.5))
 
-#define DOCTYPE "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">"
-#define DOCTYPE_FRAMES "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\"\n\"http://www.w3.org/TR/html4/frameset.dtd\">"
+#define DOCTYPE "<!DOCTYPE html>"
 
 class GfxState;
 class GooString;
@@ -83,6 +84,7 @@ public:
 	       double dx, double dy,
 	       Unicode u); 
   HtmlLink* getLink() { return link; }
+  const HtmlFont &getFont() const { return *fonts->Get(fontpos); }
   void endString(); // postprocessing
 
 private:
@@ -100,6 +102,7 @@ private:
   int len;			// length of text and xRight
   int size;			// size of text and xRight arrays
   UnicodeTextDirection dir;	// direction (left to right/right to left)
+  HtmlFontAccu *fonts;
   
   friend class HtmlPage;
 
@@ -171,6 +174,7 @@ private:
   void setDocName(char* fname);
   void dumpAsXML(FILE* f,int page);
   void dumpComplex(FILE* f, int page);
+  int dumpComplexHeaders(FILE * const file, FILE *& pageFile, int page);
 
   // marks the position of the fonts that belong to current page (for noframes)
   int fontsPageMarker; 
diff --git a/utils/HtmlUtils.h b/utils/HtmlUtils.h
new file mode 100644
index 0000000..bdb89b9
--- /dev/null
+++ b/utils/HtmlUtils.h
@@ -0,0 +1,51 @@
+//
+// HtmlUtils.h
+//
+//  Created on: Jun 8, 2011
+//      Author: Joshua Richardson <jric at chegg.com>
+//  Copyright 2011
+//
+// All changes made under the Poppler project to this file are licensed
+// under GPL version 2 or later
+//
+// Copyright (C) 2011 Joshua Richardson <jric at chegg.com>
+//
+// To see a description of the changes please see the Changelog file that
+// came with your tarball or type make ChangeLog if you are building from git
+//
+//========================================================================
+
+#ifndef HTMLUTILS_H_
+#define HTMLUTILS_H_
+
+#include <math.h> // fabs
+#include "goo/gtypes.h" // GBool
+
+// Fuzzy comparison for decimal numbers: true iff a and b lie strictly less
+// than thresh apart (exact equality is unreliable for computed doubles).
+inline GBool is_within(double a, double thresh, double b) {
+	return thresh > fabs(a - b);
+}
+
+inline GBool rot_matrices_equal(const double * const mat0, const double * const mat1) {
+	for (int i = 0; i < 4; i++) if (!is_within(mat0[i], .1, mat1[i])) return gFalse;
+	return gTrue; // all four rotation/skew entries agree to within 0.1
+}
+
+// True when the matrix is more than an upright scale, given that a rotation
+// has the form (cos q, sin q, -sin q, cos q, 0, 0).
+inline GBool isMatRotOrSkew(const double * const mat) {
+	if (mat[0] < 0) return gTrue;      // cos q < 0: e.g. a 180 deg. rotation
+	return !is_within(mat[1], .1, 0);  // sin q away from 0: any other rotation
+}
+
+// Divides the four entries by the length of the first row (mat[0], mat[1]),
+// so a scaled rotation s*(cos q, sin q, -sin q, cos q) becomes the pure
+// rotation by q for every angle; an all-zero first row is left untouched.
+inline void normalizeRotMat(double *mat) {
+	const double scale = sqrt(mat[0] * mat[0] + mat[1] * mat[1]);
+	if (!scale) return;
+	for (int i = 0; i < 4; i++) mat[i] /= scale;
+}
+
+#endif /* HTMLUTILS_H_ */


More information about the poppler mailing list