[poppler] poppler/utils: HtmlFonts.cc, NONE, 1.1 HtmlFonts.h, NONE,
1.1 HtmlLinks.cc, NONE, 1.1 HtmlLinks.h, NONE,
1.1 HtmlOutputDev.cc, NONE, 1.1 HtmlOutputDev.h, NONE,
1.1 ImageOutputDev.cc, NONE, 1.1 ImageOutputDev.h, NONE,
1.1 Makefile.am, NONE, 1.1 parseargs.c, NONE, 1.1 parseargs.h,
NONE, 1.1 pdffonts.1, NONE, 1.1 pdffonts.cc, NONE,
1.1 pdfimages.1, NONE, 1.1 pdfimages.cc, NONE, 1.1 pdfinfo.1,
NONE, 1.1 pdfinfo.cc, NONE, 1.1 pdftohtml.1, NONE,
1.1 pdftohtml.cc, NONE, 1.1 pdftoppm.1, NONE, 1.1 pdftoppm.cc,
NONE, 1.1 pdftops.1, NONE, 1.1 pdftops.cc, NONE,
1.1 pdftotext.1, NONE, 1.1 pdftotext.cc, NONE, 1.1
Kristian Høgsberg
krh at freedesktop.org
Mon Dec 12 12:15:13 PST 2005
Update of /cvs/poppler/poppler/utils
In directory gabe:/tmp/cvs-serv21377/utils
Added Files:
HtmlFonts.cc HtmlFonts.h HtmlLinks.cc HtmlLinks.h
HtmlOutputDev.cc HtmlOutputDev.h ImageOutputDev.cc
ImageOutputDev.h Makefile.am parseargs.c parseargs.h
pdffonts.1 pdffonts.cc pdfimages.1 pdfimages.cc pdfinfo.1
pdfinfo.cc pdftohtml.1 pdftohtml.cc pdftoppm.1 pdftoppm.cc
pdftops.1 pdftops.cc pdftotext.1 pdftotext.cc
Log Message:
2005-12-12 Kristian Høgsberg <krh at redhat.com>
* Makefile.am:
* configure.ac:
* goo/GooVector.h:
* utils/HtmlFonts.cc:
* utils/HtmlFonts.h:
* utils/HtmlLinks.cc:
* utils/HtmlLinks.h:
* utils/HtmlOutputDev.cc:
* utils/HtmlOutputDev.h:
* utils/ImageOutputDev.cc:
* utils/ImageOutputDev.h:
* utils/Makefile.am:
* utils/parseargs.c:
* utils/parseargs.h:
* utils/pdffonts.1:
* utils/pdffonts.cc:
* utils/pdfimages.1:
* utils/pdfimages.cc:
* utils/pdfinfo.1:
* utils/pdfinfo.cc:
* utils/pdftohtml.1:
* utils/pdftohtml.cc:
* utils/pdftoppm.1:
* utils/pdftoppm.cc:
* utils/pdftops.1:
* utils/pdftops.cc:
* utils/pdftotext.1:
* utils/pdftotext.cc: Add command line utilities from xpdf.
--- NEW FILE: HtmlFonts.cc ---
#include "HtmlFonts.h"
#include "GlobalParams.h"
#include "UnicodeMap.h"
#include <stdio.h>
// One row of the base-font lookup table: maps a full PDF font name to the
// font family name emitted in the generated HTML/XML.
// Both members point at string literals, so they are const-qualified.
struct Fonts{
  const char *Fontname;  // full font name, compared against the PDF font name
  const char *name;      // family name handed out by HtmlFont::getFontName()
};
// Number of real entries in the fonts table below.
const int font_num=13;
// Lookup table of known base fonts.  It holds font_num real entries plus one
// extra entry at index font_num; HtmlFont's constructor ends its search at
// that index when no name matches, and HtmlFont::getFontName() treats
// pos==font_num as "use the default font".
static Fonts fonts[font_num+1]={
{"Courier", "Courier" },
{"Courier-Bold", "Courier"},
{"Courier-BoldOblique", "Courier"},
{"Courier-Oblique", "Courier"},
{"Helvetica", "Helvetica"},
{"Helvetica-Bold", "Helvetica"},
{"Helvetica-BoldOblique", "Helvetica"},
{"Helvetica-Oblique", "Helvetica"},
{"Symbol", "Symbol" },
{"Times-Bold", "Times" },
{"Times-BoldItalic", "Times" },
{"Times-Italic", "Times" },
{"Times-Roman", "Times" },
// fallback entry (index font_num)
{" " , "Times" },
};
// Adds 0.5 and truncates to int.
// NOTE(review): the macro argument is not parenthesised, so callers should
// pass a simple expression.
#define xoutRound(x) ((int)(x + 0.5))
// Output-mode flag defined in another translation unit; when set, the
// HtmlFontAccu methods in this file emit XML instead of HTML/CSS.
extern GBool xml;
// Family name used by getFontName()/getFullName() when no specific font is
// known.  Replaced by setDefaultFont() and released by clear().
GooString* HtmlFont::DefaultFont=new GooString("Times"); // Arial,Helvetica,sans-serif
// Build a colour from a GfxRGB value by scaling each component by 255.
// If any scaled component fails the Ok() range check, an error is printed
// on stdout and the colour falls back to black.
HtmlFontColor::HtmlFontColor(GfxRGB rgb){
  r = static_cast<int>(255*rgb.r);
  g = static_cast<int>(255*rgb.g);
  b = static_cast<int>(255*rgb.b);

  const GBool allValid = Ok(r) && Ok(b) && Ok(g);
  if (allValid) {
    return;
  }

  printf("Error : Bad color \n");
  r = 0;
  g = 0;
  b = 0;
}
// Format one colour component as two lower-case hexadecimal digits
// (high nibble first).  Returns a newly allocated string owned by the caller.
GooString *HtmlFontColor::convtoX(unsigned int xcol) const{
  const unsigned int nibbles[2] = { xcol/16, xcol%16 };
  GooString *hex = new GooString();

  for (int idx = 0; idx < 2; ++idx) {
    const unsigned int v = nibbles[idx];
    // digits 0-9 map to '0'..'9', everything else continues from 'a'
    const char digit = (v < 10) ? (char)('0' + v) : (char)('a' + v - 10);
    hex->append(digit);
  }
  return hex;
}
// Render the colour as "#rrggbb".  Returns a newly allocated string owned by
// the caller.
GooString *HtmlFontColor::toString() const{
  const unsigned int channels[3] = { r, g, b };
  GooString *result = new GooString("#");

  for (int c = 0; c < 3; ++c) {
    GooString *hex = convtoX(channels[c]);
    result->append(hex);
    delete hex;  // convtoX() hands over ownership of its result
  }
  return result;
}
// Construct a font description.
//   ftname - PDF font name; may be NULL, in which case no name is stored and
//            the font resolves to the default font
//   _size  - font size; the stored size is _size-1
//   rgb    - text colour
// Bold/italic are derived from the lower-cased font name ("bold", "italic",
// "oblique").  pos is the index of the exact name in the fonts table, or
// font_num when the name is unknown.
HtmlFont::HtmlFont(GooString* ftname,int _size, GfxRGB rgb){
  color=HtmlFontColor(rgb);
  lineSize = -1;
  size=(_size-1);
  italic = gFalse;
  bold = gFalse;
  // Start from the "unknown font" slot so that pos always holds a valid
  // index into fonts[], even when no font name was supplied.
  pos = font_num;
  FontName = NULL;

  if (ftname) {
    FontName = new GooString(ftname);

    // Work on a private copy: lowerCase() is applied to the copy only.
    GooString *lowered = new GooString(ftname);
    const char *lname = lowered->lowerCase()->getCString();
    if (strstr(lname,"bold")) bold=gTrue;
    if (strstr(lname,"italic") || strstr(lname,"oblique")) italic=gTrue;
    delete lowered;

    // Exact (case-sensitive) match against the known base fonts.
    int i=0;
    while ((i<font_num) && strcmp(ftname->getCString(),fonts[i].Fontname)) {
      i++;
    }
    pos=i;
  }

  // DefaultFont may have been released by clear(); recreate it on demand.
  if (!DefaultFont) DefaultFont=new GooString(fonts[font_num].name);
}
// Copy constructor: deep-copies the font name so both objects own their own
// string.
HtmlFont::HtmlFont(const HtmlFont& x){
  size=x.size;
  lineSize=x.lineSize;
  italic=x.italic;
  bold=x.bold;
  pos=x.pos;
  color=x.color;
  // Always give FontName a defined value; the destructor deletes it.
  FontName = x.FontName ? new GooString(x.FontName) : NULL;
}
// Release the owned copy of the font name (deleting NULL is a no-op).
HtmlFont::~HtmlFont(){
  delete FontName;
}
// Assignment: copies all attributes and deep-copies the font name.
// Self-assignment is a no-op.
HtmlFont& HtmlFont::operator=(const HtmlFont& x){
  if (this==&x) return *this;
  size=x.size;
  lineSize=x.lineSize;
  italic=x.italic;
  bold=x.bold;
  pos=x.pos;
  color=x.color;
  // Build the replacement first, then drop the old name.  When x has no
  // name, FontName must become NULL rather than keep pointing at the string
  // that was just deleted.
  GooString *replacement = x.FontName ? new GooString(x.FontName) : NULL;
  delete FontName;
  FontName = replacement;
  return *this;
}
// Release the shared default font name.  The next HtmlFont constructed with
// the three-argument constructor recreates it.
void HtmlFont::clear(){
  delete DefaultFont;  // deleting NULL is a no-op
  DefaultFont = NULL;
}
/*
  Compares two fonts for uniqueness; used when inserting into the list of
  all encountered fonts.  Size, line size, table position, bold, italic and
  colour must all match.
*/
GBool HtmlFont::isEqual(const HtmlFont& x) const{
  if (size!=x.size) return gFalse;
  if (lineSize!=x.lineSize) return gFalse;
  if (pos!=x.pos) return gFalse;
  if (bold!=x.bold) return gFalse;
  if (italic!=x.italic) return gFalse;
  return color.isEqual(x.getColor());
}
/*
  Decides whether two pieces of text can be joined together: only size,
  font family and colour are compared; bold/italic are deliberately ignored.
*/
GBool HtmlFont::isEqualIgnoreBold(const HtmlFont& x) const{
  if (size!=x.size) return gFalse;
  const GBool sameFamily = (strcmp(fonts[pos].name, fonts[x.pos].name) == 0);
  if (!sameFamily) return gFalse;
  return color.isEqual(x.getColor());
}
// Return the font family name as a newly allocated string (caller owns it).
// Unknown fonts (pos == font_num) yield a copy of the default font name.
GooString* HtmlFont::getFontName(){
  const GBool unknown = (pos==font_num);
  if (unknown) {
    return new GooString(DefaultFont);
  }
  return new GooString(fonts[pos].name);
}
// Return a newly allocated copy of the full font name, or of the default
// font name when this font has no name.  The caller owns the result.
GooString* HtmlFont::getFullName(){
  GooString *source = FontName ? FontName : DefaultFont;
  return new GooString(source);
}
// Replace the shared default font name with a copy of defaultFont.
// The argument is copied; ownership stays with the caller.
void HtmlFont::setDefaultFont(GooString* defaultFont){
  // Copy before deleting: getDefaultFont() exposes the internal pointer, so
  // the argument may be the very string that is about to be released.
  GooString *replacement = new GooString(defaultFont);
  delete DefaultFont;
  DefaultFont = replacement;
}
// Return the shared default font name.  This is the internal pointer, not a
// copy; it may be NULL after clear().
GooString* HtmlFont::getDefaultFont(){
return DefaultFont;
}
// Convert a run of Unicode characters to the output text encoding and escape
// the characters that are special in HTML/XML (", &, <, >).
// Returns a newly allocated string owned by the caller; the string is empty
// when no text encoding is available.
// TODO: the original author flagged this method as "plain wrong".
GooString* HtmlFont::HtmlFilter(Unicode* u, int uLen) {
  GooString *tmp = new GooString();
  UnicodeMap *uMap;
  char buf[8];
  int n;

  // get the output encoding
  if (!(uMap = globalParams->getTextEncoding())) {
    return tmp;
  }

  for (int i = 0; i < uLen; ++i) {
    switch (u[i])
      {
        // Markup-significant characters are written as character entity
        // references so they cannot be mistaken for tags or attributes.
        case '"': tmp->append("&quot;");  break;
        case '&': tmp->append("&amp;");  break;
        case '<': tmp->append("&lt;");  break;
        case '>': tmp->append("&gt;");  break;
        default:
          {
            // convert unicode to string
            if ((n = uMap->mapUnicode(u[i], buf, sizeof(buf))) > 0) {
              tmp->append(buf, n);
            }
          }
      }
  }

  uMap->decRefCnt();
  return tmp;
}
// Return the HTML-escaped form of <content> (see HtmlFilter()).
// The <font> argument is currently unused: wrapping the text in <b>/<i>
// according to the font's bold/italic flags is disabled.
GooString* HtmlFont::simple(HtmlFont* font, Unicode* content, int uLen){
  return HtmlFilter (content, uLen);
}
// Create an empty font accumulator; the vector is owned by this object and
// released in the destructor.
HtmlFontAccu::HtmlFontAccu(){
accu=new GooVector<HtmlFont>();
}
// Release the owned font vector (deleting NULL is a no-op).
HtmlFontAccu::~HtmlFontAccu(){
  delete accu;
}
int HtmlFontAccu::AddFont(const HtmlFont& font){
GooVector<HtmlFont>::iterator i;
for (i=accu->begin();i!=accu->end();i++)
{
if (font.isEqual(*i))
{
return (int)(i-(accu->begin()));
}
}
accu->push_back(font);
return (accu->size()-1);
}
// Wrap <content> for font #i.  In HTML mode the text is enclosed in
// <span class="ftN">...</span>; in XML mode a plain copy of the text is
// returned.  The result is newly allocated and owned by the caller.
GooString* HtmlFontAccu::getCSStyle(int i, GooString* content){
  if (xml) {
    GooString *plain = new GooString("");
    plain->append(content);
    return plain;
  }

  GooString *index = GooString::fromInt(i);
  GooString *span = new GooString("<span class=\"ft");
  span->append(index);
  span->append("\">");
  span->append(content);
  span->append("</span>");
  delete index;
  return span;
}
// Build the definition of font #i: a CSS rule (".ftN{...}") in HTML mode or a
// <fontspec .../> element in XML mode.  The result is newly allocated and
// owned by the caller.
GooString* HtmlFontAccu::CSStyle(int i){
  GooString *out = new GooString();
  GooString *index = GooString::fromInt(i);

  // work on a copy of the stored font
  HtmlFont font = *(accu->begin() + i);

  GooString *sizeStr = GooString::fromInt(font.getSize());
  GooString *color = font.getColor().toString();
  GooString *family = font.getFontName();

  if (xml) {
    out->append("<fontspec id=\"");
    out->append(index);
    out->append("\" size=\"");
    out->append(sizeStr);
    out->append("\" family=\"");
    out->append(family);
    out->append("\" color=\"");
    out->append(color);
    out->append("\"/>");
  } else {
    out->append(".ft");
    out->append(index);
    out->append("{font-size:");
    out->append(sizeStr);
    // line-height is only emitted when a line size has been set
    if (font.getLineSize() != -1) {
      GooString *lineStr = GooString::fromInt(font.getLineSize());
      out->append("px;line-height:");
      out->append(lineStr);
      delete lineStr;
    }
    out->append("px;font-family:");
    out->append(family);
    out->append(";color:");
    out->append(color);
    out->append(";}");
  }

  delete family;
  delete color;
  delete index;
  delete sizeStr;
  return out;
}
--- NEW FILE: HtmlFonts.h ---
#ifndef _HTML_FONTS_H
#define _HTML_FONTS_H
#include "goo/GooVector.h"
#include "goo/GooString.h"
#include "GfxState.h"
#include "CharTypes.h"
// RGB colour of a piece of text, one unsigned value per channel.
class HtmlFontColor{
private:
unsigned int r;
unsigned int g;
unsigned int b;
// Range check for one channel.  The components are unsigned, so only the
// upper bound (<=255) can actually fail.
GBool Ok(unsigned int xcol){ return ((xcol<=255)&&(xcol>=0));}
// Two-digit hexadecimal form of one channel (implemented in HtmlFonts.cc).
GooString *convtoX(unsigned int xcol) const;
public:
// Default colour: black.
HtmlFontColor():r(0),g(0),b(0){}
HtmlFontColor(GfxRGB rgb);
HtmlFontColor(const HtmlFontColor& x){r=x.r;g=x.g;b=x.b;}
HtmlFontColor& operator=(const HtmlFontColor &x){
r=x.r;g=x.g;b=x.b;
return *this;
}
~HtmlFontColor(){};
// "#rrggbb" form of the colour (implemented in HtmlFonts.cc).
GooString* toString() const;
// True when all three channels match.
GBool isEqual(const HtmlFontColor& col) const{
return ((r==col.r)&&(g==col.g)&&(b==col.b));
}
} ;
class HtmlFont{
private:
unsigned int size;
int lineSize;
GBool italic;
GBool bold;
int pos; // position of the font name in the fonts array
static GooString *DefaultFont;
GooString *FontName;
HtmlFontColor color;
static GooString* HtmlFilter(Unicode* u, int uLen); //char* s);
public:
HtmlFont(){FontName=NULL;};
HtmlFont(GooString* fontname,int _size, GfxRGB rgb);
HtmlFont(const HtmlFont& x);
HtmlFont& operator=(const HtmlFont& x);
HtmlFontColor getColor() const {return color;}
~HtmlFont();
static void clear();
GooString* getFullName();
GBool isItalic() const {return italic;}
GBool isBold() const {return bold;}
unsigned int getSize() const {return size;}
int getLineSize() const {return lineSize;}
void setLineSize(int _lineSize) { lineSize = _lineSize; }
GooString* getFontName();
static GooString* getDefaultFont();
static void setDefaultFont(GooString* defaultFont);
GBool isEqual(const HtmlFont& x) const;
GBool isEqualIgnoreBold(const HtmlFont& x) const;
static GooString* simple(HtmlFont *font, Unicode *content, int uLen);
void print() const {printf("font: %s %d %s%spos: %d\n", FontName->getCString(), size, bold ? "bold " : "", italic ? "italic " : "", pos);};
};
// Collection of the distinct fonts encountered so far; fonts are referred to
// by their index in this collection.
class HtmlFontAccu{
private:
GooVector<HtmlFont> *accu;
public:
HtmlFontAccu();
~HtmlFontAccu();
// Add a font (or find an equal one) and return its index.
int AddFont(const HtmlFont& font);
// Pointer to font #i inside the vector.  No bounds check is performed.
HtmlFont* Get(int i){
GooVector<HtmlFont>::iterator g=accu->begin();
g+=i;
return g;
}
// Wrap <content> in the markup that selects font #i.
GooString* getCSStyle (int i, GooString* content);
// Definition (CSS rule or XML fontspec) of font #i.
GooString* CSStyle(int i);
// Number of fonts stored.
int size() const {return accu->size();}
};
#endif
--- NEW FILE: HtmlLinks.cc ---
#include "HtmlLinks.h"
// Copy constructor: copies the rectangle and deep-copies the destination.
HtmlLink::HtmlLink(const HtmlLink& x){
  Xmin=x.Xmin;
  Ymin=x.Ymin;
  Xmax=x.Xmax;
  Ymax=x.Ymax;
  // A default-constructed link has no destination; keep it NULL instead of
  // copying through a NULL pointer.
  dest = x.dest ? new GooString(x.dest) : NULL;
}
// Construct a link covering the given rectangle and pointing at <_dest>.
// The coordinates are normalised so that Xmin<=Xmax and Ymin<=Ymax; the
// destination string is copied.
HtmlLink::HtmlLink(double xmin,double ymin,double xmax,double ymax,GooString * _dest)
{
  const GBool xOrdered = (xmin < xmax);
  Xmin = xOrdered ? xmin : xmax;
  Xmax = xOrdered ? xmax : xmin;

  const GBool yOrdered = (ymin < ymax);
  Ymin = yOrdered ? ymin : ymax;
  Ymax = yOrdered ? ymax : ymin;

  dest = new GooString(_dest);
}
// Release the owned destination string (deleting NULL is a no-op).
HtmlLink::~HtmlLink(){
  delete dest;
}
// True when both links have the same destination string.
// Both links are expected to have a destination.
GBool HtmlLink::isEqualDest(const HtmlLink& x) const{
  const char *mine = dest->getCString();
  const char *theirs = x.dest->getCString();
  return strcmp(mine, theirs) == 0;
}
// Test whether the rectangle (xmin,ymin)-(xmax,ymax) falls on this link:
// its vertical midpoint must lie within (Ymin, Ymax] and its horizontal
// extent must overlap the link's horizontal extent.
GBool HtmlLink::inLink(double xmin,double ymin,double xmax,double ymax) const {
  const double midY = (ymin+ymax)/2;
  const bool belowTop = !(midY>Ymax);
  const bool aboveBottom = (midY>Ymin);
  const bool overlapsX = (xmin<Xmax)&&(xmax>Xmin);
  return belowTop && aboveBottom && overlapsX;
}
// Assignment: copies the rectangle and deep-copies the destination.
// Self-assignment is a no-op.
HtmlLink& HtmlLink::operator=(const HtmlLink& x){
  if (this==&x) return *this;
  // Build the copy first; a source without a destination yields NULL
  // rather than copying through a NULL pointer.
  GooString *replacement = x.dest ? new GooString(x.dest) : NULL;
  delete dest;
  dest = replacement;
  Xmin=x.Xmin;
  Ymin=x.Ymin;
  Xmax=x.Xmax;
  Ymax=x.Ymax;
  return *this;
}
// Return the opening anchor tag for this link, <A href="dest">, as a newly
// allocated string owned by the caller.  The destination is inserted as-is.
GooString* HtmlLink::getLinkStart() {
  GooString *anchor = new GooString("<A href=\"");

  anchor->append(dest);
  anchor->append("\">");

  return anchor;
}
/*GooString* HtmlLink::Link(GooString* content){
//GooString* _dest=new GooString(dest);
GooString *tmp=new GooString("<a href=\"");
tmp->append(dest);
tmp->append("\">");
tmp->append(content);
tmp->append("</a>");
//delete _dest;
return tmp;
}*/
// Create an empty link list; the vector is owned by this object.
HtmlLinks::HtmlLinks(){
accu=new GooVector<HtmlLink>();
}
// Release the owned vector of links.
HtmlLinks::~HtmlLinks(){
delete accu;
accu=NULL;
}
// Find the first stored link that the given rectangle falls on (see
// HtmlLink::inLink).  On success the link's index is stored in <p> and a
// true value is returned; otherwise <p> is left untouched and the result is
// false.
GBool HtmlLinks::inLink(double xmin,double ymin,double xmax,double ymax,int& p)const {
  GooVector<HtmlLink>::iterator cur = accu->begin();
  while (cur != accu->end()) {
    if (cur->inLink(xmin,ymin,xmax,ymax)) {
      p = (cur - accu->begin());
      return 1;
    }
    cur++;
  }
  return 0;
}
// Return a pointer to link #i inside the vector.  No bounds check is
// performed.
HtmlLink* HtmlLinks::getLink(int i) const{
  GooVector<HtmlLink>::iterator first = accu->begin();
  return first + i;
}
--- NEW FILE: HtmlLinks.h ---
#ifndef _HTML_LINKS
#define _HTML_LINKS
#include <stdlib.h>
#include <string.h>
#include "goo/GooVector.h"
#include "goo/GooString.h"
// A rectangular link area on a page together with its destination string.
// The object owns its copy of the destination.
class HtmlLink{
private:
double Xmin;
double Ymin;
double Xmax;
double Ymax;
GooString* dest;
public:
// Default constructor: no destination; the coordinates are not initialised.
HtmlLink(){dest=NULL;}
HtmlLink(const HtmlLink& x);
HtmlLink& operator=(const HtmlLink& x);
HtmlLink(double xmin,double ymin,double xmax,double ymax,GooString *_dest);
~HtmlLink();
GBool isEqualDest(const HtmlLink& x) const;
// Newly allocated copy of the destination (caller owns it).
// NOTE(review): copies through dest without a NULL check - confirm callers
// never use this on a default-constructed link.
GooString *getDest(){return new GooString(dest);}
double getX1() const {return Xmin;}
double getX2() const {return Xmax;}
double getY1() const {return Ymin;}
double getY2() const {return Ymax;}
GBool inLink(double xmin,double ymin,double xmax,double ymax) const ;
//GooString *Link(GooString *content);
// Opening <A href="..."> tag for this link (caller owns the string).
GooString* getLinkStart();
};
// List of the links collected for a page.
class HtmlLinks{
private:
GooVector<HtmlLink> *accu;
public:
HtmlLinks();
~HtmlLinks();
// Append a copy of <x> to the list.
void AddLink(const HtmlLink& x) {accu->push_back(x);}
// Look up the first link the rectangle falls on; its index is stored in <p>.
GBool inLink(double xmin,double ymin,double xmax,double ymax,int& p) const;
// Pointer to link #i (no bounds check).
HtmlLink* getLink(int i) const;
};
#endif
--- NEW FILE: HtmlOutputDev.cc ---
//========================================================================
//
// HtmlOutputDev.cc
//
// Copyright 1997-2002 Glyph & Cog, LLC
//
// Changed 1999-2000 by G.Ovtcharov
//
// Changed 2002 by Mikhail Kruk
//
//========================================================================
#ifdef __GNUC__
#pragma implementation
#endif
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
[...1530 lines suppressed...]
return atLeastOne;
}
// Return the part of the path <c> that follows the last '/'.
//   c      - path string; at most <strlen> characters of it are examined
//   strlen - length of <c> in characters
// The result is allocated with malloc() and must be released with free() by
// the caller.  A path without any '/' is returned unchanged; a path ending
// in '/' (or a NULL/empty path) yields an empty string.  Returns NULL when
// the allocation fails.
char* getFileNameFromPath(char* c, int strlen) {
  int start = 0;  /* index of the first character of the file name */
  int end = 0;    /* one past the last character examined */
  int k;
  char* res;

  if (c) {
    /* Stop at the terminator as well as at <strlen>, so a too-large length
       can never make the scan run past the end of the string. */
    for (end = 0; end < strlen && c[end] != '\0'; ++end) {
      if (c[end] == '/') {
        /* Remember the position AFTER the slash; this keeps a slash at
           index 0 distinguishable from "no slash at all". */
        start = end + 1;
      }
    }
  }

  res = (char *)malloc((size_t)(end - start) + 1);
  if (!res) {
    return NULL;
  }
  /* Copy exactly the characters that were scanned, never more. */
  for (k = start; k < end; ++k) {
    res[k - start] = c[k];
  }
  res[end - start] = '\0';
  return res;
}
--- NEW FILE: HtmlOutputDev.h ---
//========================================================================
//
// HtmlOutputDev.h
//
// Copyright 1997 Derek B. Noonburg
//
// Changed 1999 by G.Ovtcharov
//========================================================================
#ifndef HTMLOUTPUTDEV_H
#define HTMLOUTPUTDEV_H
#ifdef __GNUC__
#pragma interface
#endif
#include <stdio.h>
#include "goo/gtypes.h"
#include "goo/GooList.h"
#include "GfxFont.h"
#include "OutputDev.h"
#include "HtmlLinks.h"
#include "HtmlFonts.h"
#include "Link.h"
#include "Catalog.h"
#include "UnicodeMap.h"
#ifdef WIN32
# define SLASH '\\'
#else
# define SLASH '/'
#endif
#define xoutRound(x) ((int)(x + 0.5))
#define DOCTYPE "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">"
#define DOCTYPE_FRAMES "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\"\n\"http://www.w3.org/TR/html4/frameset.dtd\">"
class GfxState;
class GooString;
//------------------------------------------------------------------------
// HtmlString
//------------------------------------------------------------------------
// Writing direction of a string (see HtmlString::dir).
enum UnicodeTextDirection {
textDirUnknown,
textDirLeftRight,
textDirRightLeft,
textDirTopBottom
};
// One string of text on a page: its characters, bounding box, font index
// and optional link.  Strings are chained into lists by HtmlPage, which is
// a friend of this class.
class HtmlString {
public:
// Constructor.
HtmlString(GfxState *state, double fontSize, HtmlFontAccu* fonts);
// Destructor.
~HtmlString();
// Add a character to the string.
void addChar(GfxState *state, double x, double y,
double dx, double dy,
Unicode u);
// Link attached to this string.
HtmlLink* getLink() { return link; }
void endString(); // postprocessing
private:
// change the text variable
HtmlLink *link;
double xMin, xMax; // bounding box x coordinates
double yMin, yMax; // bounding box y coordinates
int col; // starting column
Unicode *text; // the text
double *xRight; // right-hand x coord of each char
HtmlString *yxNext; // next string in y-major order
HtmlString *xyNext; // next string in x-major order
int fontpos; // font index, passed to HtmlFontAccu::Get() by HtmlPage
GooString* htext;
int len; // length of text and xRight
int size; // size of text and xRight arrays
UnicodeTextDirection dir; // direction (left to right/right to left)
friend class HtmlPage;
};
//------------------------------------------------------------------------
// HtmlPage
//------------------------------------------------------------------------
// Text content of one page: the strings collected while the page is drawn,
// plus the fonts and links they refer to.
class HtmlPage {
public:
// Constructor.
HtmlPage(GBool rawOrder, char *imgExtVal);
// Destructor.
~HtmlPage();
// Begin a new string.
void beginString(GfxState *state, GooString *s);
// Add a character to the current string.
void addChar(GfxState *state, double x, double y,
double dx, double dy,
double ox, double oy,
Unicode *u, int uLen); //Guchar c);
void updateFont(GfxState *state);
// End the current string, sorting it into the list of strings.
void endString();
// Coalesce strings that look like parts of the same line.
void coalesce();
// NOTE(review): the following description ("Find a string ...") has no
// matching declaration in this class.
// Find a string. If <top> is true, starts looking at top of page;
// otherwise starts looking at <xMin>,<yMin>. If <bottom> is true,
// stops looking at bottom of page; otherwise stops looking at
// <xMax>,<yMax>. If found, sets the text bounding rectangle and
// returns true; otherwise returns false.
// new functions
// Add a link to this page's link list.
void AddLink(const HtmlLink& x){
links->AddLink(x);
}
void dump(FILE *f, int pageNum);
// Clear the page.
void clear();
void conv();
private:
// Font used by the given string, looked up by its font index.
HtmlFont* getFont(HtmlString *hStr) { return fonts->Get(hStr->fontpos); }
double fontSize; // current font size
GBool rawOrder; // keep strings in content stream order
HtmlString *curStr; // currently active string
HtmlString *yxStrings; // strings in y-major order
HtmlString *xyStrings; // strings in x-major order
HtmlString *yxCur1, *yxCur2; // cursors for yxStrings list
void setDocName(char* fname);
void dumpAsXML(FILE* f,int page);
void dumpComplex(FILE* f, int page);
// marks the position of the fonts that belong to current page (for noframes)
int fontsPageMarker;
HtmlFontAccu *fonts;
HtmlLinks *links;
GooString *DocName;
GooString *imgExt;
int pageWidth;
int pageHeight;
static int pgNum;
int firstPage; // used to begin the numeration of pages
friend class HtmlOutputDev;
};
//------------------------------------------------------------------------
// HtmlMetaVar
//------------------------------------------------------------------------
// A name/content pair describing one piece of document meta information.
class HtmlMetaVar {
public:
HtmlMetaVar(char *_name, char *_content);
~HtmlMetaVar();
// String form of the pair.
// NOTE(review): presumably an HTML <META> tag - confirm in HtmlOutputDev.cc.
GooString* toString();
private:
GooString *name;
GooString *content;
};
//------------------------------------------------------------------------
// HtmlOutputDev
//------------------------------------------------------------------------
// Output device that collects the text, links and images of a document for
// HTML/XML output.
class HtmlOutputDev: public OutputDev {
public:
// Open a text output file. If <fileName> is NULL, no file is written
// (this is useful, e.g., for searching text). If <useASCII7> is true,
// text is converted to 7-bit ASCII; otherwise, text is converted to
// 8-bit ISO Latin-1. <useASCII7> should also be set for Japanese
// (EUC-JP) text. If <rawOrder> is true, the text is kept in content
// stream order.
// NOTE(review): <useASCII7> is not a parameter of this constructor; the
// sentences about it look stale.
HtmlOutputDev(char *fileName, char *title,
char *author,
char *keywords,
char *subject,
char *date,
char *extension,
GBool rawOrder,
int firstPage = 1,
GBool outline = 0);
// Destructor.
virtual ~HtmlOutputDev();
// Check if file was successfully created.
virtual GBool isOk() { return ok; }
//---- get info about output device
// Does this device use upside-down coordinates?
// (Upside-down means (0,0) is the top left corner of the page.)
virtual GBool upsideDown() { return gTrue; }
// Does this device use drawChar() or drawString()?
virtual GBool useDrawChar() { return gTrue; }
// Does this device use beginType3Char/endType3Char? Otherwise,
// text in Type 3 fonts will be drawn with drawChar/drawString.
virtual GBool interpretType3Chars() { return gFalse; }
// Does this device need non-text content?
virtual GBool needNonText() { return gFalse; }
//----- initialization and control
// Start a page.
virtual void startPage(int pageNum, GfxState *state);
// End a page.
virtual void endPage();
//----- update text state
virtual void updateFont(GfxState *state);
//----- text drawing
virtual void beginString(GfxState *state, GooString *s);
virtual void endString(GfxState *state);
virtual void drawChar(GfxState *state, double x, double y,
double dx, double dy,
double originX, double originY,
CharCode code, Unicode *u, int uLen);
//----- image drawing
virtual void drawImageMask(GfxState *state, Object *ref,
Stream *str,
int width, int height, GBool invert,
GBool inlineImg);
virtual void drawImage(GfxState *state, Object *ref, Stream *str,
int width, int height, GfxImageColorMap *colorMap,
int *maskColors, GBool inlineImg);
//new feature
// Fixed identifier value (1234) for this device type.
virtual int DevType() {return 1234;}
virtual void drawLink(Link *link,Catalog *cat);
int getPageWidth() { return maxPageWidth; }
int getPageHeight() { return maxPageHeight; }
GBool dumpDocOutline(Catalog* catalog);
/* char* getFileNameFromPath(char* c, int strlen); */
private:
// convert encoding into a HTML standard, or encoding->getCString if not
// recognized
static char* mapEncodingToHtml(GooString* encoding);
GooString* getLinkDest(Link *link,Catalog *catalog);
void dumpMetaVars(FILE *);
void doFrame(int firstPage);
GBool newOutlineLevel(FILE *output, Object *node, Catalog* catalog, int level = 1);
FILE *fContentsFrame;
FILE *page; // html file
//FILE *tin; // image log file
//GBool write;
GBool needClose; // need to close the file?
HtmlPage *pages; // text for the current page
GBool rawOrder; // keep text in content stream order
GBool doOutline; // output document outline
GBool ok; // set up ok?
GBool dumpJPEG;
int pageNum;
int maxPageWidth;
int maxPageHeight;
static int imgNum;
GooString *Docname;
GooString *docTitle;
GooList *glMetaVars;
friend class HtmlPage;
};
char* getFileNameFromPath(char* c, int strlen);
#endif
--- NEW FILE: ImageOutputDev.cc ---
//========================================================================
//
// ImageOutputDev.cc
//
// Copyright 1998-2003 Glyph & Cog, LLC
//
//========================================================================
#include <poppler-config.h>
#ifdef USE_GCC_PRAGMAS
#pragma implementation
#endif
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <ctype.h>
#include "goo/gmem.h"
#include "config.h"
#include "Error.h"
#include "GfxState.h"
#include "Object.h"
#include "Stream.h"
#include "DCTStream.h"
#include "ImageOutputDev.h"
// Set up an image dumper writing to files named <fileRootA>-NNN.<ext>.
// The root is copied; the file-name buffer leaves 20 bytes beyond the root
// for the "-NNN.ext" suffix and the terminator.
ImageOutputDev::ImageOutputDev(char *fileRootA, GBool dumpJPEGA) {
  ok = gTrue;
  imgNum = 0;
  dumpJPEG = dumpJPEGA;

  fileRoot = copyString(fileRootA);
  const size_t rootLen = strlen(fileRoot);
  fileName = (char *)gmalloc(rootLen + 20);
}
// Release the file-name buffer and the copy of the file root allocated in
// the constructor.
ImageOutputDev::~ImageOutputDev() {
gfree(fileName);
gfree(fileRoot);
}
// Write an image mask to the next numbered output file.
// When JPEG dumping is enabled and the stream is a non-inline DCT stream, the
// raw stream is copied verbatim to a .jpg file; otherwise the mask data is
// written as a PBM ("P4") file.  The image counter advances even when the
// file cannot be opened.
void ImageOutputDev::drawImageMask(GfxState *state, Object *ref, Stream *str,
                                   int width, int height, GBool invert,
                                   GBool inlineImg) {
  const GBool asJPEG = dumpJPEG && str->getKind() == strDCT && !inlineImg;
  FILE *out;

  // build the file name and open the image file
  if (asJPEG) {
    sprintf(fileName, "%s-%03d.jpg", fileRoot, imgNum);
  } else {
    sprintf(fileName, "%s-%03d.pbm", fileRoot, imgNum);
  }
  ++imgNum;
  out = fopen(fileName, "wb");
  if (!out) {
    error(-1, "Couldn't open image file '%s'", fileName);
    return;
  }

  if (asJPEG) {
    // copy the raw (still compressed) stream up to end of data
    int ch;
    str = ((DCTStream *)str)->getRawStream();
    str->reset();
    while ((ch = str->getChar()) != EOF) {
      fputc(ch, out);
    }
  } else {
    // PBM header followed by height rows of ceil(width/8) bytes
    fprintf(out, "P4\n");
    fprintf(out, "%d %d\n", width, height);
    str->reset();
    const int nBytes = height * ((width + 7) / 8);
    for (int k = 0; k < nBytes; ++k) {
      fputc(str->getChar(), out);
    }
  }

  str->close();
  fclose(out);
}
// Write an image to the next numbered output file.  Three cases:
//   - JPEG dumping enabled, non-inline DCT stream with 3 components:
//     the raw stream is copied verbatim to a .jpg file;
//   - 1 component, 1 bit per component: the data is written, bit-inverted,
//     as a PBM ("P4") file;
//   - anything else: the image is converted to RGB through <colorMap> and
//     written as a PPM ("P6") file.
// The image counter advances even when the file cannot be opened.
void ImageOutputDev::drawImage(GfxState *state, Object *ref, Stream *str,
                               int width, int height,
                               GfxImageColorMap *colorMap,
                               int *maskColors, GBool inlineImg) {
  FILE *f;
  ImageStream *imgStr;
  Guchar *p;
  GfxRGB rgb;
  int x, y;
  int c;
  int size, i;

  // dump JPEG file
  if (dumpJPEG && str->getKind() == strDCT &&
      colorMap->getNumPixelComps() == 3 &&
      !inlineImg) {

    // open the image file
    sprintf(fileName, "%s-%03d.jpg", fileRoot, imgNum);
    ++imgNum;
    if (!(f = fopen(fileName, "wb"))) {
      error(-1, "Couldn't open image file '%s'", fileName);
      return;
    }

    // initialize stream
    str = ((DCTStream *)str)->getRawStream();
    str->reset();

    // copy the stream
    while ((c = str->getChar()) != EOF)
      fputc(c, f);

    str->close();
    fclose(f);

  // dump PBM file
  } else if (colorMap->getNumPixelComps() == 1 &&
             colorMap->getBits() == 1) {

    // open the image file and write the PBM header
    sprintf(fileName, "%s-%03d.pbm", fileRoot, imgNum);
    ++imgNum;
    if (!(f = fopen(fileName, "wb"))) {
      error(-1, "Couldn't open image file '%s'", fileName);
      return;
    }
    fprintf(f, "P4\n");
    fprintf(f, "%d %d\n", width, height);

    // initialize stream
    str->reset();

    // copy the stream, inverting every bit
    size = height * ((width + 7) / 8);
    for (i = 0; i < size; ++i) {
      fputc(str->getChar() ^ 0xff, f);
    }

    str->close();
    fclose(f);

  // dump PPM file
  } else {

    // open the image file and write the PPM header
    sprintf(fileName, "%s-%03d.ppm", fileRoot, imgNum);
    ++imgNum;
    if (!(f = fopen(fileName, "wb"))) {
      error(-1, "Couldn't open image file '%s'", fileName);
      return;
    }
    fprintf(f, "P6\n");
    fprintf(f, "%d %d\n", width, height);
    fprintf(f, "255\n");

    // initialize stream
    imgStr = new ImageStream(str, width, colorMap->getNumPixelComps(),
                             colorMap->getBits());
    imgStr->reset();

    // for each line...
    for (y = 0; y < height; ++y) {

      // write the line
      p = imgStr->getLine();
      if (p) {
        for (x = 0; x < width; ++x) {
          colorMap->getRGB(p, &rgb);
          fputc((int)(rgb.r * 255 + 0.5), f);
          fputc((int)(rgb.g * 255 + 0.5), f);
          fputc((int)(rgb.b * 255 + 0.5), f);
          p += colorMap->getNumPixelComps();
        }
      } else {
        // No line data (e.g. a truncated stream): never dereference the
        // NULL pointer; pad the row with black so the file still holds the
        // width x height pixels announced in the header.
        for (x = 0; x < width; ++x) {
          fputc(0, f);
          fputc(0, f);
          fputc(0, f);
        }
      }
    }
    delete imgStr;

    fclose(f);
  }
}
--- NEW FILE: ImageOutputDev.h ---
//========================================================================
//
// ImageOutputDev.h
//
// Copyright 1998-2003 Glyph & Cog, LLC
//
//========================================================================
#ifndef IMAGEOUTPUTDEV_H
#define IMAGEOUTPUTDEV_H
#include <poppler-config.h>
#ifdef USE_GCC_PRAGMAS
#pragma interface
#endif
#include <stdio.h>
#include "goo/gtypes.h"
#include "OutputDev.h"
class GfxState;
//------------------------------------------------------------------------
// ImageOutputDev
//------------------------------------------------------------------------
// Output device that writes every image of a document to its own file.
class ImageOutputDev: public OutputDev {
public:
// Create an OutputDev which will write images to files named
// <fileRoot>-NNN.<type>. Normally, all images are written as PBM
// (.pbm) or PPM (.ppm) files. If <dumpJPEG> is set, JPEG images are
// written as JPEG (.jpg) files.
ImageOutputDev(char *fileRootA, GBool dumpJPEGA);
// Destructor.
virtual ~ImageOutputDev();
// Check if file was successfully created.
virtual GBool isOk() { return ok; }
// Does this device use beginType3Char/endType3Char? Otherwise,
// text in Type 3 fonts will be drawn with drawChar/drawString.
virtual GBool interpretType3Chars() { return gFalse; }
// Does this device need non-text content?
virtual GBool needNonText() { return gFalse; }
//---- get info about output device
// Does this device use upside-down coordinates?
// (Upside-down means (0,0) is the top left corner of the page.)
virtual GBool upsideDown() { return gTrue; }
// Does this device use drawChar() or drawString()?
virtual GBool useDrawChar() { return gFalse; }
//----- image drawing
// Both methods write the image to the next numbered file and advance imgNum.
virtual void drawImageMask(GfxState *state, Object *ref, Stream *str,
int width, int height, GBool invert,
GBool inlineImg);
virtual void drawImage(GfxState *state, Object *ref, Stream *str,
int width, int height, GfxImageColorMap *colorMap,
int *maskColors, GBool inlineImg);
private:
char *fileRoot; // root of output file names
char *fileName; // buffer for output file names
GBool dumpJPEG; // set to dump native JPEG files
int imgNum; // current image number
GBool ok; // set up ok?
};
#endif
--- NEW FILE: Makefile.am ---
INCLUDES = \
-I$(top_srcdir) \
-I$(top_srcdir)/poppler
LDADD = \
$(top_builddir)/poppler/libpoppler.la
bin_PROGRAMS = pdffonts pdfimages pdfinfo pdftops pdftotext pdftohtml
man1_MANS = pdffonts.1 pdfimages.1 pdfinfo.1 pdftops.1 pdftotext.1 pdftohtml.1
pdffonts_SOURCES = pdffonts.cc parseargs.c
pdfimages_SOURCES = pdfimages.cc ImageOutputDev.cc parseargs.c
pdfinfo_SOURCES = pdfinfo.cc parseargs.c
pdftops_SOURCES = pdftops.cc parseargs.c
pdftotext_SOURCES = pdftotext.cc parseargs.c
pdftohtml_SOURCES = pdftohtml.cc parseargs.c \
HtmlFonts.cc HtmlLinks.cc HtmlOutputDev.cc
--- NEW FILE: parseargs.c ---
/*
* parseargs.c
*
* Command line argument parser.
*
* Copyright 1996-2003 Glyph & Cog, LLC
*/
#include <stdio.h>
#include <stddef.h>
#include <string.h>
#include <stdlib.h>
#include <ctype.h>
#include "parseargs.h"
static ArgDesc *findArg(ArgDesc *args, char *arg);
static GBool grabArg(ArgDesc *arg, int i, int *argc, char *argv[]);
/*
 * Scan argv[1..] for switches listed in <args>.  Every recognised switch
 * (and its value, if any) is stored through the descriptor and removed
 * from argv, with *argc adjusted accordingly.  A literal "--" is removed
 * and ends the scan.  Returns gFalse if any recognised switch had a missing
 * or malformed value, gTrue otherwise.
 */
GBool parseArgs(ArgDesc *args, int *argc, char *argv[]) {
  ArgDesc *match;
  GBool allOk;
  int pos, k;

  allOk = gTrue;
  pos = 1;
  while (pos < *argc) {
    if (strcmp(argv[pos], "--") == 0) {
      /* drop the "--" itself and stop parsing */
      --*argc;
      for (k = pos; k < *argc; ++k) {
        argv[k] = argv[k+1];
      }
      break;
    }

    match = findArg(args, argv[pos]);
    if (!match) {
      /* not one of ours: leave it in place and move on */
      ++pos;
      continue;
    }

    /* grabArg() removes the consumed entries, so <pos> stays put */
    if (!grabArg(match, pos, argc, argv)) {
      allOk = gFalse;
    }
  }
  return allOk;
}
/*
 * Print a usage message on stderr: a "Usage:" line followed by one line per
 * entry of <args> showing the switch, a type tag for its value and, when
 * present, the usage text.  The type tags are padded so that the usage
 * texts line up.
 */
void printUsage(char *program, char *otherArgs, ArgDesc *args) {
  ArgDesc *desc;
  char *label;
  int widest, len, pad;

  /* width of the longest switch name */
  widest = 0;
  for (desc = args; desc->arg; ++desc) {
    len = strlen(desc->arg);
    if (len > widest) {
      widest = len;
    }
  }

  fprintf(stderr, "Usage: %s [options]", program);
  if (otherArgs) {
    fprintf(stderr, " %s", otherArgs);
  }
  fprintf(stderr, "\n");

  for (desc = args; desc->arg; ++desc) {
    fprintf(stderr, " %s", desc->arg);
    pad = 9 + widest - strlen(desc->arg);

    if (desc->kind == argInt || desc->kind == argIntDummy) {
      label = " <int>";
    } else if (desc->kind == argFP || desc->kind == argFPDummy) {
      label = " <fp>";
    } else if (desc->kind == argString || desc->kind == argStringDummy) {
      label = " <string>";
    } else {
      /* flags (and anything unknown) take no value */
      label = "";
    }

    fprintf(stderr, "%-*s", pad, label);
    if (desc->usage) {
      fprintf(stderr, ": %s", desc->usage);
    }
    fprintf(stderr, "\n");
  }
}
/*
 * Look up <arg> in the descriptor list.  Dummy entries (kinds from
 * argFlagDummy upwards) are never matched.  Returns the matching descriptor
 * or NULL.
 */
static ArgDesc *findArg(ArgDesc *args, char *arg) {
  ArgDesc *cand;

  cand = args;
  while (cand->arg) {
    if (!(cand->kind >= argFlagDummy) && strcmp(cand->arg, arg) == 0) {
      return cand;
    }
    ++cand;
  }
  return NULL;
}
/*
 * Consume the switch at argv[i] described by <arg>: store its value through
 * arg->val, then remove the switch (and its value, when one was accepted)
 * from argv and decrease *argc.  Returns gFalse when a required value is
 * missing or malformed; in that case only the switch itself is removed.
 */
static GBool grabArg(ArgDesc *arg, int i, int *argc, char *argv[]) {
  GBool success;
  int consumed;  /* number of argv entries to remove */
  int hasValue;  /* is there an argv entry after the switch? */
  int k;

  success = gTrue;
  consumed = 1;
  hasValue = (i + 1 < *argc);

  if (arg->kind == argFlag) {
    *(GBool *)arg->val = gTrue;
  } else if (arg->kind == argInt) {
    if (hasValue && isInt(argv[i+1])) {
      *(int *)arg->val = atoi(argv[i+1]);
      consumed = 2;
    } else {
      success = gFalse;
    }
  } else if (arg->kind == argFP) {
    if (hasValue && isFP(argv[i+1])) {
      *(double *)arg->val = atof(argv[i+1]);
      consumed = 2;
    } else {
      success = gFalse;
    }
  } else if (arg->kind == argString) {
    if (hasValue) {
      /* bounded copy, always terminated */
      strncpy((char *)arg->val, argv[i+1], arg->size - 1);
      ((char *)arg->val)[arg->size - 1] = '\0';
      consumed = 2;
    } else {
      success = gFalse;
    }
  } else {
    fprintf(stderr, "Internal error in arg table\n");
  }

  /* close the gap left by the consumed entries */
  *argc -= consumed;
  for (k = i; k < *argc; ++k) {
    argv[k] = argv[k+consumed];
  }
  return success;
}
/*
 * Return gTrue if <s> is a valid decimal integer: an optional '+' or '-'
 * sign followed by at least one digit and nothing else.  The empty string
 * and a lone sign are rejected.
 */
GBool isInt(char *s) {
  if (*s == '-' || *s == '+')
    ++s;
  /* at least one digit is required */
  if (!isdigit((unsigned char)*s))
    return gFalse;
  /* cast: isdigit() is only defined for unsigned char values and EOF */
  while (isdigit((unsigned char)*s))
    ++s;
  if (*s)
    return gFalse;
  return gTrue;
}
/*
 * Return gTrue if <s> is a valid floating point number: an optional sign,
 * digits with an optional decimal point (at least one digit in total), and
 * an optional exponent ('e' or 'E', optional sign, at least one digit).
 * The empty string, a lone sign and a lone "." are rejected.
 */
GBool isFP(char *s) {
  int n;  /* number of mantissa digits seen */

  if (*s == '-' || *s == '+')
    ++s;
  n = 0;
  /* cast: isdigit() is only defined for unsigned char values and EOF */
  while (isdigit((unsigned char)*s)) {
    ++s;
    ++n;
  }
  if (*s == '.')
    ++s;
  while (isdigit((unsigned char)*s)) {
    ++s;
    ++n;
  }
  /* a number needs at least one digit before or after the point */
  if (n == 0)
    return gFalse;
  if (*s == 'e' || *s == 'E') {
    ++s;
    if (*s == '-' || *s == '+')
      ++s;
    if (!isdigit((unsigned char)*s))
      return gFalse;
    do {
      ++s;
    } while (isdigit((unsigned char)*s));
  }
  if (*s)
    return gFalse;
  return gTrue;
}
--- NEW FILE: parseargs.h ---
/*
* parseargs.h
*
* Command line argument parser.
*
* Copyright 1996-2003 Glyph & Cog, LLC
*/
#ifndef PARSEARGS_H
#define PARSEARGS_H
#ifdef __cplusplus
extern "C" {
#endif
#include "goo/gtypes.h"
/*
 * Argument kinds.
 *
 * The four real kinds are listed first and the four "Dummy" kinds last.
 * The switch lookup in parseargs.c depends on this order: it only
 * matches entries whose kind compares less than argFlagDummy.
 */
typedef enum {
argFlag, /* flag (present / not-present) */
/* [val: GBool *] */
argInt, /* integer arg */
/* [val: int *] */
argFP, /* floating point arg */
/* [val: double *] */
argString, /* string arg */
/* [val: char *] */
/* dummy entries -- these show up in the usage listing only; */
/* useful for X args, for example */
argFlagDummy,
argIntDummy,
argFPDummy,
argStringDummy
} ArgKind;
/*
 * Argument descriptor.
 *
 * Descriptors are used as an array.  The code that walks the array
 * stops at the first entry whose <arg> field is NULL, so every table
 * must end with such an entry.
 */
typedef struct {
char *arg; /* the command line switch */
ArgKind kind; /* kind of arg */
void *val; /* place to store value */
int size; /* for argString: size of the buffer <val> points to; */
/* at most size-1 characters are copied and the result is */
/* always NUL-terminated */
char *usage; /* usage string (may be NULL: then no description */
/* is printed in the usage listing) */
} ArgDesc;
/*
 * Parse command line. Removes all args which are found in the arg
 * descriptor list <args>. Stops parsing if "--" is found (and removes
 * it). Returns gFalse if there was an error.
 *
 * <argc> is passed by pointer so that it can be updated; the callers in
 * the utilities check the resulting argc to count the positional
 * arguments that are left.
 */
extern GBool parseArgs(ArgDesc *args, int *argc, char *argv[]);
/*
 * Print usage message, based on arg descriptor list.
 *
 * The per-option listing (switch, value type and usage text) is
 * written to stderr.
 */
extern void printUsage(char *program, char *otherArgs, ArgDesc *args);
/*
 * Check if a string is a valid integer or floating point number.
 *
 * isInt: an optional '+' or '-' sign followed by decimal digits.
 * isFP:  an optional sign, decimal digits with an optional '.', and an
 *        optional exponent ('e' or 'E', optional sign, digits).
 * Any other trailing character makes the string invalid.
 */
extern GBool isInt(char *s);
extern GBool isFP(char *s);
#ifdef __cplusplus
}
#endif
#endif
--- NEW FILE: pdffonts.1 ---
.\" Copyright 1999-2004 Glyph & Cog, LLC
.TH pdffonts 1 "22 January 2004"
.SH NAME
pdffonts \- Portable Document Format (PDF) font analyzer (version
3.00)
.SH SYNOPSIS
.B pdffonts
[options]
.I PDF-file
.SH DESCRIPTION
.B Pdffonts
lists the fonts used in a Portable Document Format (PDF) file along
with various information for each font.
.PP
The following information is listed for each font:
.TP
.B name
the font name, exactly as given in the PDF file (potentially including
a subset prefix)
.TP
.B type
the font type -- see below for details
.TP
.B emb
"yes" if the font is embedded in the PDF file
.TP
.B sub
"yes" if the font is a subset
.TP
.B uni
"yes" if there is an explicit "ToUnicode" map in the PDF file (the
absence of a ToUnicode map doesn't necessarily mean that the text
can't be converted to Unicode)
.TP
.B object ID
the font dictionary object ID (number and generation)
.PP
PDF files can contain the following types of fonts:
.PP
.RS
Type 1
.RE
.RS
Type 1C -- aka Compact Font Format (CFF)
.RE
.RS
Type 3
.RE
.RS
TrueType
.RE
.RS
CID Type 0 -- 16-bit font with no specified type
.RE
.RS
CID Type 0C -- 16-bit PostScript CFF font
.RE
.RS
CID TrueType -- 16-bit TrueType font
.RE
.SH CONFIGURATION FILE
Pdffonts reads a configuration file at startup. It first tries to
find the user's private config file, ~/.xpdfrc. If that doesn't
exist, it looks for a system-wide config file, /etc/xpdf/xpdfrc. See the
.BR xpdfrc (5)
man page for details.
.SH OPTIONS
Many of the following options can be set with configuration file
commands. These are listed in square brackets with the description of
the corresponding command line option.
.TP
.BI \-f " number"
Specifies the first page to analyze.
.TP
.BI \-l " number"
Specifies the last page to analyze.
.TP
.BI \-opw " password"
Specify the owner password for the PDF file. Providing this will
bypass all security restrictions.
.TP
.BI \-upw " password"
Specify the user password for the PDF file.
.TP
.BI \-cfg " config-file"
Read
.I config-file
in place of ~/.xpdfrc or the system-wide config file.
.TP
.B \-v
Print copyright and version information.
.TP
.B \-h
Print usage information.
.RB ( \-help
and
.B \-\-help
are equivalent.)
.SH EXIT CODES
The Xpdf tools use the following exit codes:
.TP
0
No error.
.TP
1
Error opening a PDF file.
.TP
2
Error opening an output file.
.TP
3
Error related to PDF permissions.
.TP
99
Other error.
.SH AUTHOR
The pdffonts software and documentation are copyright 1996-2004 Glyph
& Cog, LLC.
.SH "SEE ALSO"
.BR xpdf (1),
.BR pdftops (1),
.BR pdftotext (1),
.BR pdfinfo (1),
.BR pdftoppm (1),
.BR pdfimages (1),
.BR xpdfrc (5)
.br
.B http://www.foolabs.com/xpdf/
--- NEW FILE: pdffonts.cc ---
//========================================================================
//
// pdffonts.cc
//
// Copyright 2001-2003 Glyph & Cog, LLC
//
//========================================================================
#include <poppler-config.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <string.h>
#include <math.h>
#include "parseargs.h"
#include "goo/GooString.h"
#include "goo/gmem.h"
#include "GlobalParams.h"
#include "Error.h"
#include "Object.h"
#include "Dict.h"
#include "GfxFont.h"
#include "Annot.h"
#include "PDFDoc.h"
#include "config.h"
// Names printed in the "type" column of the font table.  scanFont()
// indexes this array directly with the value of font->getType(), so the
// entries presumably follow the order of the font type enum declared in
// GfxFont.h -- TODO confirm when that enum changes.
static char *fontTypeNames[] = {
"unknown",
"Type 1",
"Type 1C",
"Type 3",
"TrueType",
"CID Type 0",
"CID Type 0C",
"CID TrueType"
};
static void scanFonts(Dict *resDict, PDFDoc *doc);
static void scanFont(GfxFont *font, PDFDoc *doc);
// Storage for the command line options; filled in by parseArgs() through
// the pointers in argDesc below.

// First page to scan; main() raises values below 1 to 1.
static int firstPage = 1;
// Last page to scan; main() replaces values below 1 (including this
// default) or beyond the page count with the number of pages.
static int lastPage = 0;
// Passwords.  A leading '\001' byte is the "not given on the command
// line" marker that main() tests for; an empty string is therefore a
// valid, explicitly supplied password.
static char ownerPassword[33] = "\001";
static char userPassword[33] = "\001";
// Config file name passed to the GlobalParams constructor.
static char cfgFileName[256] = "";
static GBool printVersion = gFalse;
static GBool printHelp = gFalse;
// Option table: switch, kind, destination, destination size (only used
// for argString entries) and usage text.  The {NULL} entry terminates
// the table.
static ArgDesc argDesc[] = {
{"-f", argInt, &firstPage, 0,
"first page to examine"},
{"-l", argInt, &lastPage, 0,
"last page to examine"},
{"-opw", argString, ownerPassword, sizeof(ownerPassword),
"owner password (for encrypted files)"},
{"-upw", argString, userPassword, sizeof(userPassword),
"user password (for encrypted files)"},
{"-cfg", argString, cfgFileName, sizeof(cfgFileName),
"configuration file to use in place of .xpdfrc"},
{"-v", argFlag, &printVersion, 0,
"print copyright and version info"},
{"-h", argFlag, &printHelp, 0,
"print usage information"},
{"-help", argFlag, &printHelp, 0,
"print usage information"},
{"--help", argFlag, &printHelp, 0,
"print usage information"},
{"-?", argFlag, &printHelp, 0,
"print usage information"},
{NULL}
};
// IDs of the fonts that have already been printed, so that each font is
// listed only once.  scanFont() appends to the array and grows it with
// grealloc() in steps of 32 entries; main() resets it before scanning
// and releases it with gfree().
static Ref *fonts;
// number of entries of <fonts> in use
static int fontsLen;
// number of entries <fonts> has room for
static int fontsSize;
// pdffonts entry point: parses the options, opens the PDF file named by
// the single positional argument and prints one table row per distinct
// font found in the resource dictionaries of the requested pages and of
// the appearance streams of their annotations.
//
// Returns 0 on success, 1 if the document could not be opened, and 99
// otherwise -- which includes the -v / -h paths, since they leave
// through err0 with the initial exit code.
int main(int argc, char *argv[]) {
PDFDoc *doc;
GooString *fileName;
GooString *ownerPW, *userPW;
GBool ok;
Page *page;
Dict *resDict;
Annots *annots;
Object obj1, obj2;
int pg, i;
int exitCode;
exitCode = 99;
// parse args
ok = parseArgs(argDesc, &argc, argv);
// after option removal exactly the program name and the file name must
// be left
if (!ok || argc != 2 || printVersion || printHelp) {
fprintf(stderr, "pdffonts version %s\n", xpdfVersion);
fprintf(stderr, "%s\n", xpdfCopyright);
if (!printVersion) {
printUsage("pdffonts", "<PDF-file>", argDesc);
}
goto err0;
}
// NOTE(review): fileName is never deleted in this function; presumably
// PDFDoc takes ownership of it -- confirm against PDFDoc.
fileName = new GooString(argv[1]);
// read config file
globalParams = new GlobalParams(cfgFileName);
// open PDF file
// a leading '\001' means the password option was not given
if (ownerPassword[0] != '\001') {
ownerPW = new GooString(ownerPassword);
} else {
ownerPW = NULL;
}
if (userPassword[0] != '\001') {
userPW = new GooString(userPassword);
} else {
userPW = NULL;
}
doc = new PDFDoc(fileName, ownerPW, userPW);
if (userPW) {
delete userPW;
}
if (ownerPW) {
delete ownerPW;
}
if (!doc->isOk()) {
exitCode = 1;
goto err1;
}
// get page range
if (firstPage < 1) {
firstPage = 1;
}
if (lastPage < 1 || lastPage > doc->getNumPages()) {
lastPage = doc->getNumPages();
}
// scan the fonts
printf("name type emb sub uni object ID\n");
printf("------------------------------------ ------------ --- --- --- ---------\n");
// start with an empty list of already-printed fonts
fonts = NULL;
fontsLen = fontsSize = 0;
for (pg = firstPage; pg <= lastPage; ++pg) {
page = doc->getCatalog()->getPage(pg);
// fonts referenced by the page itself
if ((resDict = page->getResourceDict())) {
scanFonts(resDict, doc);
}
// fonts referenced by the appearance streams of the page's annotations
annots = new Annots(doc->getXRef(),
doc->getCatalog(),
page->getAnnots(&obj1));
obj1.free();
for (i = 0; i < annots->getNumAnnots(); ++i) {
if (annots->getAnnot(i)->getAppearance(&obj1)->isStream()) {
obj1.streamGetDict()->lookup("Resources", &obj2);
if (obj2.isDict()) {
scanFonts(obj2.getDict(), doc);
}
obj2.free();
}
obj1.free();
}
delete annots;
}
exitCode = 0;
// clean up
gfree(fonts);
// err1: the document and the global parameters exist and must be freed
err1:
delete doc;
delete globalParams;
// err0: nothing has been allocated yet
err0:
// check for memory leaks
Object::memCheck(stderr);
gMemReport(stderr);
return exitCode;
}
// Report every font reachable from the resource dictionary <resDict>:
// first the entries of its "Font" dictionary (each one is handed to
// scanFont()), then, recursively, the "Resources" dictionaries of all
// stream objects in its "XObject" dictionary.
//
// NOTE(review): the recursion keeps no record of the dictionaries it
// has already visited, so a file whose XObject resources refer back to
// an enclosing XObject would recurse without end.
static void scanFonts(Dict *resDict, PDFDoc *doc) {
Object obj1, obj2, xObjDict, xObj, resObj;
Ref r;
GfxFontDict *gfxFontDict;
GfxFont *font;
int i;
// scan the fonts in this resource dictionary
gfxFontDict = NULL;
// lookupNF: the "Font" entry is inspected as stored, so a reference
// can be told apart from a direct dictionary
resDict->lookupNF("Font", &obj1);
if (obj1.isRef()) {
// indirect font dictionary: fetch it and pass its reference along
obj1.fetch(doc->getXRef(), &obj2);
if (obj2.isDict()) {
r = obj1.getRef();
gfxFontDict = new GfxFontDict(doc->getXRef(), &r, obj2.getDict());
}
obj2.free();
} else if (obj1.isDict()) {
// direct font dictionary: there is no reference to pass
gfxFontDict = new GfxFontDict(doc->getXRef(), NULL, obj1.getDict());
}
if (gfxFontDict) {
for (i = 0; i < gfxFontDict->getNumFonts(); ++i) {
// getFont() may return NULL; such entries are skipped
if ((font = gfxFontDict->getFont(i))) {
scanFont(font, doc);
}
}
delete gfxFontDict;
}
obj1.free();
// recursively scan any resource dictionaries in objects in this
// resource dictionary
resDict->lookup("XObject", &xObjDict);
if (xObjDict.isDict()) {
for (i = 0; i < xObjDict.dictGetLength(); ++i) {
xObjDict.dictGetVal(i, &xObj);
if (xObj.isStream()) {
xObj.streamGetDict()->lookup("Resources", &resObj);
if (resObj.isDict()) {
scanFonts(resObj.getDict(), doc);
}
resObj.free();
}
xObj.free();
}
}
xObjDict.free();
}
// Print the table row for <font> (name, type, embedded / subset /
// ToUnicode flags and object ID) unless a font with the same ID has
// been printed before, then record its ID in the global <fonts> list.
static void scanFont(GfxFont *font, PDFDoc *doc) {
  Object fontDictObj, toUniObj;
  Ref embeddedRef;
  Ref id = *font->getID();

  // nothing to do for a font that has already been listed
  for (int k = 0; k < fontsLen; ++k) {
    if (fonts[k].num == id.num && fonts[k].gen == id.gen) {
      return;
    }
  }

  // name as stored in the file; may be NULL
  GooString *origName = font->getOrigName();

  // Type 3 fonts are always reported as embedded; for every other type
  // ask the font for its embedded font ID
  GBool embedded = gTrue;
  if (font->getType() != fontType3) {
    embedded = font->getEmbeddedFontID(&embeddedRef);
  }

  // a ToUnicode map is present when the font dictionary has a
  // "ToUnicode" entry that is a stream
  GBool mapped = gFalse;
  Object *fetched = doc->getXRef()->fetch(id.num, id.gen, &fontDictObj);
  if (fetched->isDict()) {
    Object *entry = fontDictObj.dictLookup("ToUnicode", &toUniObj);
    mapped = entry->isStream();
    toUniObj.free();
  }
  fontDictObj.free();

  // subset names start with one or more capital letters followed by '+'
  GBool isSubset = gFalse;
  if (origName) {
    int prefix = 0;
    while (prefix < origName->getLength()) {
      char c = origName->getChar(prefix);
      if (c < 'A' || c > 'Z') {
        break;
      }
      ++prefix;
    }
    isSubset = prefix > 0 && prefix < origName->getLength() &&
               origName->getChar(prefix) == '+';
  }

  // emit the row
  printf("%-36s %-12s %-3s %-3s %-3s",
         origName ? origName->getCString() : "[none]",
         fontTypeNames[font->getType()],
         embedded ? "yes" : "no",
         isSubset ? "yes" : "no",
         mapped ? "yes" : "no");
  if (id.gen >= 100000) {
    printf(" [none]\n");
  } else {
    printf(" %6d %2d\n", id.num, id.gen);
  }

  // remember the font, growing the list by 32 entries when it is full
  if (fontsLen == fontsSize) {
    fontsSize += 32;
    fonts = (Ref *)grealloc(fonts, fontsSize * sizeof(Ref));
  }
  fonts[fontsLen] = *font->getID();
  ++fontsLen;
}
--- NEW FILE: pdfimages.1 ---
.\" Copyright 1998-2004 Glyph & Cog, LLC
.TH pdfimages 1 "22 January 2004"
.SH NAME
pdfimages \- Portable Document Format (PDF) image extractor
(version 3.00)
.SH SYNOPSIS
.B pdfimages
[options]
.I PDF-file image-root
.SH DESCRIPTION
.B Pdfimages
saves images from a Portable Document Format (PDF) file as Portable
Pixmap (PPM), Portable Bitmap (PBM), or JPEG files.
.PP
Pdfimages reads the PDF file
.IR PDF-file ,
scans one or more pages, and writes one PPM, PBM, or JPEG file for each image,
.IR image-root - nnn . xxx ,
where
.I nnn
is the image number and
.I xxx
is the image type (.ppm, .pbm, .jpg).
.SH CONFIGURATION FILE
Pdfimages reads a configuration file at startup. It first tries to
find the user's private config file, ~/.xpdfrc. If that doesn't
exist, it looks for a system-wide config file, /etc/xpdf/xpdfrc. See the
.BR xpdfrc (5)
man page for details.
.SH OPTIONS
Many of the following options can be set with configuration file
commands. These are listed in square brackets with the description of
the corresponding command line option.
.TP
.BI \-f " number"
Specifies the first page to scan.
.TP
.BI \-l " number"
Specifies the last page to scan.
.TP
.B \-j
Normally, all images are written as PBM (for monochrome images) or PPM
(for non-monochrome images) files. With this option, images in DCT
format are saved as JPEG files. All non-DCT images are saved in
PBM/PPM format as usual.
.TP
.BI \-opw " password"
Specify the owner password for the PDF file. Providing this will
bypass all security restrictions.
.TP
.BI \-upw " password"
Specify the user password for the PDF file.
.TP
.B \-q
Don't print any messages or errors.
.RB "[config file: " errQuiet ]
.TP
.B \-v
Print copyright and version information.
.TP
.B \-h
Print usage information.
.RB ( \-help
and
.B \-\-help
are equivalent.)
.SH EXIT CODES
The Xpdf tools use the following exit codes:
.TP
0
No error.
.TP
1
Error opening a PDF file.
.TP
2
Error opening an output file.
.TP
3
Error related to PDF permissions.
.TP
99
Other error.
.SH AUTHOR
The pdfimages software and documentation are copyright 1998-2004 Glyph
& Cog, LLC.
.SH "SEE ALSO"
.BR xpdf (1),
.BR pdftops (1),
.BR pdftotext (1),
.BR pdfinfo (1),
.BR pdffonts (1),
.BR pdftoppm (1),
.BR xpdfrc (5)
.br
.B http://www.foolabs.com/xpdf/
--- NEW FILE: pdfimages.cc ---
//========================================================================
//
// pdfimages.cc
//
// Copyright 1998-2003 Glyph & Cog, LLC
//
// Modified for Debian by Hamish Moffatt, 22 May 2002.
//
//========================================================================
#include <poppler-config.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <string.h>
#include "parseargs.h"
#include "goo/GooString.h"
#include "goo/gmem.h"
#include "GlobalParams.h"
#include "Object.h"
#include "Stream.h"
#include "Array.h"
#include "Dict.h"
#include "XRef.h"
#include "Catalog.h"
#include "Page.h"
#include "PDFDoc.h"
#include "ImageOutputDev.h"
#include "Error.h"
#include "config.h"
// Storage for the command line options; filled in by parseArgs() through
// the pointers in argDesc below.

// First page to scan; main() raises values below 1 to 1.
static int firstPage = 1;
// Last page to scan; main() replaces values below 1 (including this
// default) or beyond the page count with the number of pages.
static int lastPage = 0;
// Passed to the ImageOutputDev constructor by main().
static GBool dumpJPEG = gFalse;
// Passwords.  A leading '\001' byte is the "not given on the command
// line" marker that main() tests for.
static char ownerPassword[33] = "\001";
static char userPassword[33] = "\001";
// When set, main() calls globalParams->setErrQuiet().
static GBool quiet = gFalse;
// Config file name passed to the GlobalParams constructor.
static char cfgFileName[256] = "";
static GBool printVersion = gFalse;
static GBool printHelp = gFalse;
// Option table: switch, kind, destination, destination size (only used
// for argString entries) and usage text.  The {NULL} entry terminates
// the table.
static ArgDesc argDesc[] = {
{"-f", argInt, &firstPage, 0,
"first page to convert"},
{"-l", argInt, &lastPage, 0,
"last page to convert"},
{"-j", argFlag, &dumpJPEG, 0,
"write JPEG images as JPEG files"},
{"-opw", argString, ownerPassword, sizeof(ownerPassword),
"owner password (for encrypted files)"},
{"-upw", argString, userPassword, sizeof(userPassword),
"user password (for encrypted files)"},
{"-q", argFlag, &quiet, 0,
"don't print any messages or errors"},
{"-cfg", argString, cfgFileName, sizeof(cfgFileName),
"configuration file to use in place of .xpdfrc"},
{"-v", argFlag, &printVersion, 0,
"print copyright and version info"},
{"-h", argFlag, &printHelp, 0,
"print usage information"},
{"-help", argFlag, &printHelp, 0,
"print usage information"},
{"--help", argFlag, &printHelp, 0,
"print usage information"},
{"-?", argFlag, &printHelp, 0,
"print usage information"},
{NULL}
};
// pdfimages entry point: parses the options, opens the PDF file given as
// the first positional argument and renders the requested pages through
// an ImageOutputDev created with the second positional argument (the
// image root) and the -j flag.
//
// Returns 0 on success, 1 if the document could not be opened, 3 if
// ENFORCE_PERMISSIONS is defined and copying is not allowed, and 99
// otherwise -- which includes the -v / -h paths.
int main(int argc, char *argv[]) {
PDFDoc *doc;
GooString *fileName;
char *imgRoot;
GooString *ownerPW, *userPW;
ImageOutputDev *imgOut;
GBool ok;
int exitCode;
exitCode = 99;
// parse args
ok = parseArgs(argDesc, &argc, argv);
// after option removal the program name, the file name and the image
// root must be left
if (!ok || argc != 3 || printVersion || printHelp) {
fprintf(stderr, "pdfimages version %s\n", xpdfVersion);
fprintf(stderr, "%s\n", xpdfCopyright);
if (!printVersion) {
printUsage("pdfimages", "<PDF-file> <image-root>", argDesc);
}
goto err0;
}
// NOTE(review): fileName is never deleted in this function; presumably
// PDFDoc takes ownership of it -- confirm against PDFDoc.
fileName = new GooString(argv[1]);
imgRoot = argv[2];
// read config file
globalParams = new GlobalParams(cfgFileName);
if (quiet) {
globalParams->setErrQuiet(quiet);
}
// open PDF file
// a leading '\001' means the password option was not given
if (ownerPassword[0] != '\001') {
ownerPW = new GooString(ownerPassword);
} else {
ownerPW = NULL;
}
if (userPassword[0] != '\001') {
userPW = new GooString(userPassword);
} else {
userPW = NULL;
}
doc = new PDFDoc(fileName, ownerPW, userPW);
if (userPW) {
delete userPW;
}
if (ownerPW) {
delete ownerPW;
}
if (!doc->isOk()) {
exitCode = 1;
goto err1;
}
// check for copy permission
#ifdef ENFORCE_PERMISSIONS
if (!doc->okToCopy()) {
error(-1, "Copying of images from this document is not allowed.");
exitCode = 3;
goto err1;
}
#endif
// get page range
if (firstPage < 1)
firstPage = 1;
if (lastPage < 1 || lastPage > doc->getNumPages())
lastPage = doc->getNumPages();
// write image files
imgOut = new ImageOutputDev(imgRoot, dumpJPEG);
// NOTE(review): when imgOut->isOk() is false no pages are processed,
// but the exit code is still set to 0 below -- confirm whether that
// case should be reported as an error.
if (imgOut->isOk()) {
doc->displayPages(imgOut, firstPage, lastPage, 72, 72, 0,
gTrue, gFalse, gFalse);
}
delete imgOut;
exitCode = 0;
// clean up
// err1: the document and the global parameters exist and must be freed
err1:
delete doc;
delete globalParams;
// err0: nothing has been allocated yet
err0:
// check for memory leaks
Object::memCheck(stderr);
gMemReport(stderr);
return exitCode;
}
--- NEW FILE: pdfinfo.1 ---
.\" Copyright 1999-2004 Glyph & Cog, LLC
.TH pdfinfo 1 "22 January 2004"
.SH NAME
pdfinfo \- Portable Document Format (PDF) document information
extractor (version 3.00)
.SH SYNOPSIS
.B pdfinfo
[options]
.I PDF-file
.SH DESCRIPTION
.B Pdfinfo
prints the contents of the \'Info' dictionary (plus some other useful
information) from a Portable Document Format (PDF) file.
.PP
The \'Info' dictionary contains the following values:
.PP
.RS
title
.RE
.RS
subject
.RE
.RS
keywords
.RE
.RS
author
.RE
.RS
creator
.RE
.RS
producer
.RE
.RS
creation date
.RE
.RS
modification date
.RE
.PP
In addition, the following information is printed:
.PP
.RS
tagged (yes/no)
.RE
.RS
page count
.RE
.RS
encrypted flag (yes/no)
.RE
.RS
print and copy permissions (if encrypted)
.RE
.RS
page size
.RE
.RS
file size
.RE
.RS
linearized (yes/no)
.RE
.RS
PDF version
.RE
.RS
metadata (only if requested)
.RE
.SH CONFIGURATION FILE
Pdfinfo reads a configuration file at startup. It first tries to find
the user's private config file, ~/.xpdfrc. If that doesn't exist, it
looks for a system-wide config file, /etc/xpdf/xpdfrc. See the
.BR xpdfrc (5)
man page for details.
.SH OPTIONS
Many of the following options can be set with configuration file
commands. These are listed in square brackets with the description of
the corresponding command line option.
.TP
.BI \-f " number"
Specifies the first page to examine. If multiple pages are requested
using the "-f" and "-l" options, the size of each requested page (and,
optionally, its bounding boxes) is printed.
Otherwise, only page one is examined.
.TP
.BI \-l " number"
Specifies the last page to examine.
.TP
.B \-box
Prints the page bounding boxes: MediaBox, CropBox, BleedBox,
TrimBox, and ArtBox.
.TP
.B \-meta
Prints document-level metadata. (This is the "Metadata" stream from
the PDF file's Catalog object.)
.TP
.BI \-enc " encoding-name"
Sets the encoding to use for text output. The
.I encoding\-name
must be defined with the unicodeMap command (see
.BR xpdfrc (5)).
This defaults to "Latin1" (which is a built-in encoding).
.RB "[config file: " textEncoding ]
.TP
.BI \-opw " password"
Specify the owner password for the PDF file. Providing this will
bypass all security restrictions.
.TP
.BI \-upw " password"
Specify the user password for the PDF file.
.TP
.BI \-cfg " config-file"
Read
.I config-file
in place of ~/.xpdfrc or the system-wide config file.
.TP
.B \-v
Print copyright and version information.
.TP
.B \-h
Print usage information.
.RB ( \-help
and
.B \-\-help
are equivalent.)
.SH EXIT CODES
The Xpdf tools use the following exit codes:
.TP
0
No error.
.TP
1
Error opening a PDF file.
.TP
2
Error opening an output file.
.TP
3
Error related to PDF permissions.
.TP
99
Other error.
.SH AUTHOR
The pdfinfo software and documentation are copyright 1996-2004 Glyph &
Cog, LLC.
.SH "SEE ALSO"
.BR xpdf (1),
.BR pdftops (1),
.BR pdftotext (1),
.BR pdffonts (1),
.BR pdftoppm (1),
.BR pdfimages (1),
.BR xpdfrc (5)
.br
.B http://www.foolabs.com/xpdf/
--- NEW FILE: pdfinfo.cc ---
//========================================================================
//
// pdfinfo.cc
//
// Copyright 1998-2003 Glyph & Cog, LLC
//
//========================================================================
#include <poppler-config.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include "parseargs.h"
#include "goo/GooString.h"
#include "goo/gmem.h"
#include "GlobalParams.h"
#include "Object.h"
#include "Stream.h"
#include "Array.h"
#include "Dict.h"
#include "XRef.h"
#include "Catalog.h"
#include "Page.h"
#include "PDFDoc.h"
#include "CharTypes.h"
#include "UnicodeMap.h"
#include "Error.h"
#include "config.h"
static void printInfoString(Dict *infoDict, char *key, char *text,
UnicodeMap *uMap);
static void printInfoDate(Dict *infoDict, char *key, char *text);
static void printBox(char *text, PDFRectangle *box);
// Storage for the command line options; filled in by parseArgs() through
// the pointers in argDesc below.

// First page to examine; main() raises values below 1 to 1.
static int firstPage = 1;
// Last page to examine.  The value 0 (this default) makes main() select
// single-page output.
static int lastPage = 0;
static GBool printBoxes = gFalse;
static GBool printMetadata = gFalse;
// Output text encoding; when non-empty main() passes it to
// globalParams->setTextEncoding().
static char textEncName[128] = "";
// Passwords.  A leading '\001' byte is the "not given on the command
// line" marker that main() tests for.
static char ownerPassword[33] = "\001";
static char userPassword[33] = "\001";
// Config file name passed to the GlobalParams constructor.
static char cfgFileName[256] = "";
static GBool printVersion = gFalse;
static GBool printHelp = gFalse;
// Option table: switch, kind, destination, destination size (only used
// for argString entries) and usage text.  The {NULL} entry terminates
// the table.
static ArgDesc argDesc[] = {
{"-f", argInt, &firstPage, 0,
"first page to convert"},
{"-l", argInt, &lastPage, 0,
"last page to convert"},
{"-box", argFlag, &printBoxes, 0,
"print the page bounding boxes"},
{"-meta", argFlag, &printMetadata, 0,
"print the document metadata (XML)"},
{"-enc", argString, textEncName, sizeof(textEncName),
"output text encoding name"},
{"-opw", argString, ownerPassword, sizeof(ownerPassword),
"owner password (for encrypted files)"},
{"-upw", argString, userPassword, sizeof(userPassword),
"user password (for encrypted files)"},
{"-cfg", argString, cfgFileName, sizeof(cfgFileName),
"configuration file to use in place of .xpdfrc"},
{"-v", argFlag, &printVersion, 0,
"print copyright and version info"},
{"-h", argFlag, &printHelp, 0,
"print usage information"},
{"-help", argFlag, &printHelp, 0,
"print usage information"},
{"--help", argFlag, &printHelp, 0,
"print usage information"},
{"-?", argFlag, &printHelp, 0,
"print usage information"},
{NULL}
};
// pdfinfo entry point: parses the options, opens the PDF file named by
// the single positional argument and prints, in this order, the Info
// dictionary entries, the tagging flag, the page count, the encryption
// state, the page size(s), optionally the page boxes, the file size,
// the linearization flag, the PDF version and optionally the metadata.
//
// Returns 0 on success, 1 if the document could not be opened, and 99
// otherwise -- which includes the -v / -h paths and a failure to obtain
// the text encoding.
int main(int argc, char *argv[]) {
PDFDoc *doc;
GooString *fileName;
GooString *ownerPW, *userPW;
UnicodeMap *uMap;
Page *page;
Object info;
char buf[256];
double w, h, wISO, hISO;
FILE *f;
GooString *metadata;
GBool ok;
int exitCode;
int pg, i;
GBool multiPage;
exitCode = 99;
// parse args
ok = parseArgs(argDesc, &argc, argv);
// after option removal exactly the program name and the file name must
// be left
if (!ok || argc != 2 || printVersion || printHelp) {
fprintf(stderr, "pdfinfo version %s\n", xpdfVersion);
fprintf(stderr, "%s\n", xpdfCopyright);
if (!printVersion) {
printUsage("pdfinfo", "<PDF-file>", argDesc);
}
goto err0;
}
fileName = new GooString(argv[1]);
// read config file
globalParams = new GlobalParams(cfgFileName);
if (textEncName[0]) {
globalParams->setTextEncoding(textEncName);
}
// get mapping to output encoding
if (!(uMap = globalParams->getTextEncoding())) {
error(-1, "Couldn't get text encoding");
// no PDFDoc exists yet on this path, so fileName is released here
delete fileName;
goto err1;
}
// open PDF file
// a leading '\001' means the password option was not given
if (ownerPassword[0] != '\001') {
ownerPW = new GooString(ownerPassword);
} else {
ownerPW = NULL;
}
if (userPassword[0] != '\001') {
userPW = new GooString(userPassword);
} else {
userPW = NULL;
}
// NOTE(review): from here on fileName is not deleted by this function;
// presumably PDFDoc takes ownership of it -- confirm against PDFDoc.
doc = new PDFDoc(fileName, ownerPW, userPW);
if (userPW) {
delete userPW;
}
if (ownerPW) {
delete ownerPW;
}
if (!doc->isOk()) {
exitCode = 1;
goto err2;
}
// get page range
if (firstPage < 1) {
firstPage = 1;
}
// lastPage == 0 means -l was not given (or was given as 0): report a
// single page without the "Page N" prefix
// NOTE(review): in that case lastPage becomes 1, so with "-f N" (N > 1)
// and no -l the page size loop below does not run at all.
if (lastPage == 0) {
multiPage = gFalse;
lastPage = 1;
} else {
multiPage = gTrue;
}
if (lastPage < 1 || lastPage > doc->getNumPages()) {
lastPage = doc->getNumPages();
}
// print doc info
doc->getDocInfo(&info);
if (info.isDict()) {
printInfoString(info.getDict(), "Title", "Title: ", uMap);
printInfoString(info.getDict(), "Subject", "Subject: ", uMap);
printInfoString(info.getDict(), "Keywords", "Keywords: ", uMap);
printInfoString(info.getDict(), "Author", "Author: ", uMap);
printInfoString(info.getDict(), "Creator", "Creator: ", uMap);
printInfoString(info.getDict(), "Producer", "Producer: ", uMap);
printInfoDate(info.getDict(), "CreationDate", "CreationDate: ");
printInfoDate(info.getDict(), "ModDate", "ModDate: ");
}
info.free();
// print tagging info
printf("Tagged: %s\n",
doc->getStructTreeRoot()->isDict() ? "yes" : "no");
// print page count
printf("Pages: %d\n", doc->getNumPages());
// print encryption info
printf("Encrypted: ");
if (doc->isEncrypted()) {
printf("yes (print:%s copy:%s change:%s addNotes:%s)\n",
doc->okToPrint(gTrue) ? "yes" : "no",
doc->okToCopy(gTrue) ? "yes" : "no",
doc->okToChange(gTrue) ? "yes" : "no",
doc->okToAddNotes(gTrue) ? "yes" : "no");
} else {
printf("no\n");
}
// print page size
for (pg = firstPage; pg <= lastPage; ++pg) {
w = doc->getPageMediaWidth(pg);
h = doc->getPageMediaHeight(pg);
if (multiPage) {
printf("Page %4d size: %g x %g pts", pg, w, h);
} else {
printf("Page size: %g x %g pts", w, h);
}
// 612 x 792 pts is 8.5 x 11 in; either orientation counts as letter
if ((fabs(w - 612) < 0.1 && fabs(h - 792) < 0.1) ||
(fabs(w - 792) < 0.1 && fabs(h - 612) < 0.1)) {
printf(" (letter)");
} else {
// Start from A0 in points (height 2^(1/4) m = 2^(1/4) * 7200 / 2.54
// pts, width = height / sqrt(2)) and step down to A6: each smaller
// size takes the previous width as its height.  A match within 1 pt
// in either orientation is accepted.
hISO = sqrt(sqrt(2.0)) * 7200 / 2.54;
wISO = hISO / sqrt(2.0);
for (i = 0; i <= 6; ++i) {
if ((fabs(w - wISO) < 1 && fabs(h - hISO) < 1) ||
(fabs(w - hISO) < 1 && fabs(h - wISO) < 1)) {
printf(" (A%d)", i);
break;
}
hISO = wISO;
wISO /= sqrt(2.0);
}
}
printf("\n");
}
// print the boxes
if (printBoxes) {
if (multiPage) {
for (pg = firstPage; pg <= lastPage; ++pg) {
page = doc->getCatalog()->getPage(pg);
sprintf(buf, "Page %4d MediaBox: ", pg);
printBox(buf, page->getMediaBox());
sprintf(buf, "Page %4d CropBox: ", pg);
printBox(buf, page->getCropBox());
sprintf(buf, "Page %4d BleedBox: ", pg);
printBox(buf, page->getBleedBox());
sprintf(buf, "Page %4d TrimBox: ", pg);
printBox(buf, page->getTrimBox());
sprintf(buf, "Page %4d ArtBox: ", pg);
printBox(buf, page->getArtBox());
}
} else {
// NOTE(review): firstPage is never checked against the page count, so
// "-box -f N" with N beyond the last page reaches getPage() with an
// out-of-range number -- verify how Catalog::getPage handles that.
page = doc->getCatalog()->getPage(firstPage);
printBox("MediaBox: ", page->getMediaBox());
printBox("CropBox: ", page->getCropBox());
printBox("BleedBox: ", page->getBleedBox());
printBox("TrimBox: ", page->getTrimBox());
printBox("ArtBox: ", page->getArtBox());
}
}
// print file size
// the file is reopened and its size is taken from the end position
#ifdef VMS
f = fopen(fileName->getCString(), "rb", "ctx=stm");
#else
f = fopen(fileName->getCString(), "rb");
#endif
if (f) {
// NOTE(review): the offset is cast to Guint (or int) before printing;
// sizes that do not fit in that type will be misreported.
#if HAVE_FSEEKO
fseeko(f, 0, SEEK_END);
printf("File size: %u bytes\n", (Guint)ftello(f));
#elif HAVE_FSEEK64
fseek64(f, 0, SEEK_END);
printf("File size: %u bytes\n", (Guint)ftell64(f));
#else
fseek(f, 0, SEEK_END);
printf("File size: %d bytes\n", (int)ftell(f));
#endif
fclose(f);
}
// print linearization info
printf("Optimized: %s\n", doc->isLinearized() ? "yes" : "no");
// print PDF version
printf("PDF version: %.1f\n", doc->getPDFVersion());
// print the metadata
if (printMetadata && (metadata = doc->readMetadata())) {
fputs("Metadata:\n", stdout);
fputs(metadata->getCString(), stdout);
fputc('\n', stdout);
delete metadata;
}
exitCode = 0;
// clean up
// err2: the encoding map and the document exist
err2:
uMap->decRefCnt();
delete doc;
// err1: only the global parameters exist
err1:
delete globalParams;
// err0: nothing has been allocated yet
err0:
// check for memory leaks
Object::memCheck(stderr);
gMemReport(stderr);
return exitCode;
}
// If <infoDict> has a string entry named <key>, write the label <text>
// followed by the string, converted through <uMap>, and a newline to
// stdout.  Nothing is printed when the entry is missing or is not a
// string.
static void printInfoString(Dict *infoDict, char *key, char *text,
                            UnicodeMap *uMap) {
  Object obj;
  char buf[8];

  if (!infoDict->lookup(key, &obj)->isString()) {
    obj.free();
    return;
  }

  fputs(text, stdout);
  GooString *str = obj.getString();

  // A 0xFE 0xFF prefix marks a string whose remaining bytes are read as
  // 16-bit values, high byte first; otherwise every byte is one
  // character.
  const GBool hasByteOrderMark = (str->getChar(0) & 0xff) == 0xfe &&
                                 (str->getChar(1) & 0xff) == 0xff;
  int pos = hasByteOrderMark ? 2 : 0;

  while (pos < str->getLength()) {
    Unicode u;
    if (hasByteOrderMark) {
      u = ((str->getChar(pos) & 0xff) << 8) |
          (str->getChar(pos + 1) & 0xff);
      pos += 2;
    } else {
      u = str->getChar(pos) & 0xff;
      pos += 1;
    }
    // convert to the output encoding and emit the resulting bytes
    int nBytes = uMap->mapUnicode(u, buf, sizeof(buf));
    fwrite(buf, 1, nBytes, stdout);
  }
  fputc('\n', stdout);
  obj.free();
}
// If <infoDict> has a string entry named <key>, write the label <text>
// followed by the date and a newline to stdout.  A value of the form
// [D:]YYYYMMDDHHMMSS is printed in the "%c" format of strftime(); any
// value that cannot be parsed or converted is printed as it is (minus
// a leading "D:").
static void printInfoDate(Dict *infoDict, char *key, char *text) {
  Object obj;

  if (infoDict->lookup(key, &obj)->isString()) {
    int year, mon, day, hour, min, sec;
    struct tm tmStruct;
    char buf[256];
    GBool formatted = gFalse;

    fputs(text, stdout);

    // skip the optional "D:" prefix
    char *dateStr = obj.getString()->getCString();
    if (dateStr[0] == 'D' && dateStr[1] == ':') {
      dateStr += 2;
    }

    if (sscanf(dateStr, "%4d%2d%2d%2d%2d%2d",
               &year, &mon, &day, &hour, &min, &sec) == 6) {
      tmStruct.tm_sec = sec;
      tmStruct.tm_min = min;
      tmStruct.tm_hour = hour;
      tmStruct.tm_mday = day;
      tmStruct.tm_mon = mon - 1;
      tmStruct.tm_year = year - 1900;
      tmStruct.tm_wday = -1;
      tmStruct.tm_yday = -1;
      tmStruct.tm_isdst = -1;
      // mktime() fills in the tm_wday and tm_yday fields
      if (mktime(&tmStruct) != (time_t)-1 &&
          strftime(buf, sizeof(buf), "%c", &tmStruct)) {
        formatted = gTrue;
      }
    }

    // fall back to the raw text when the date could not be formatted
    fputs(formatted ? buf : dateStr, stdout);
    fputc('\n', stdout);
  }
  obj.free();
}
// Write the label <text> followed by the four coordinates of <box>
// (x1, y1, x2, y2), each with two decimals, and a newline to stdout.
static void printBox(char *text, PDFRectangle *box) {
  const double left = box->x1;
  const double bottom = box->y1;
  const double right = box->x2;
  const double top = box->y2;

  printf("%s%8.2f %8.2f %8.2f %8.2f\n", text, left, bottom, right, top);
}
--- NEW FILE: pdftohtml.1 ---
.TH PDFTOHTML 1
.\" NAME should be all caps, SECTION should be 1-8, maybe w/ subsection
.\" other parms are allowed: see man(7), man(1)
.SH NAME
pdftohtml \- program to convert pdf files into html, xml and png images
.SH SYNOPSIS
.B pdftohtml
.I "[options] <PDF-file> [<html-file> <xml-file>]"
.SH "DESCRIPTION"
This manual page briefly documents the
.BR pdftohtml
command.
This manual page was written for the Debian GNU/Linux distribution
because the original program does not have a manual page.
.PP
.B pdftohtml
is a program that converts pdf documents into html. It generates its output in
the current working directory.
.SH OPTIONS
A summary of options is included below.
.TP
.B \-h, \-help
Show summary of options.
.TP
.B \-f <int>
first page to print
.TP
.B \-l <int>
last page to print
.TP
.B \-q
don't print any messages or errors
.TP
.B \-v
print copyright and version info
.TP
.B \-p
exchange .pdf links with .html
.TP
.B \-c
generate complex output
.TP
.B \-i
ignore images
.TP
.B \-noframes
generate no frames. Not supported in complex output mode.
.TP
.B \-stdout
use standard output
.TP
.B \-zoom <fp>
zoom the pdf document (default 1.5)
.TP
.B \-xml
output for XML post-processing
.TP
.B \-enc <string>
output text encoding name
.TP
.B \-opw <string>
owner password (for encrypted files)
.TP
.B \-upw <string>
user password (for encrypted files)
.TP
.B \-hidden
force hidden text extraction
.TP
.B \-dev
output device name for Ghostscript (png16m, jpeg etc)
.TP
.B \-nomerge
do not merge paragraphs
.TP
.B \-nodrm
override document DRM settings
.SH AUTHOR
Pdftohtml was developed by Gueorgui Ovtcharov and Rainer Dorsch. It is
based on, and benefits a lot from, Derek Noonburg's xpdf package.
This manual page was written by Søren Boll Overgaard <boll at debian.org>,
for the Debian GNU/Linux system (but may be used by others).
--- NEW FILE: pdftohtml.cc ---
//========================================================================
//
// pdftohtml.cc
//
//
// Copyright 1999-2000 G. Ovtcharov
//========================================================================
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <string.h>
#include <dirent.h>
#include <poppler-config.h>
#include <time.h>
#include "parseargs.h"
#include "goo/GooString.h"
#include "goo/gmem.h"
#include "Object.h"
#include "Stream.h"
#include "Array.h"
#include "Dict.h"
#include "XRef.h"
#include "Catalog.h"
#include "Page.h"
#include "PDFDoc.h"
#include "HtmlOutputDev.h"
#include "PSOutputDev.h"
#include "GlobalParams.h"
#include "Error.h"
#include "config.h"
#include "goo/gfile.h"
#ifndef GHOSTSCRIPT
# define GHOSTSCRIPT "gs"
#endif
// Storage for the command line options; filled in by parseArgs() through
// the pointers in argDesc below.  Several of these variables are not
// static and so have external linkage; presumably they are read by the
// HTML output code in other files -- TODO confirm.
static int firstPage = 1;
static int lastPage = 0;
// no option sets rawOrder: its "-raw" table entry is commented out below
static GBool rawOrder = gTrue;
// no entry in the option table below refers to printCommands
GBool printCommands = gTrue;
static GBool printHelp = gFalse;
GBool printHtml = gFalse;
GBool complexMode=gFalse;
GBool ignore=gFalse;
//char extension[5]=".png";
// zoom factor set by -zoom
double scale=1.5;
GBool noframes=gFalse;
GBool stout=gFalse;
GBool xml=gFalse;
GBool errQuiet=gFalse;
GBool noDrm=gFalse;
GBool showHidden = gFalse;
GBool noMerge = gFalse;
// the passwords default to the empty string here
static char ownerPassword[33] = "";
static char userPassword[33] = "";
// Ghostscript output device name set by -dev
static char gsDevice[33] = "png16m";
static GBool printVersion = gFalse;
static GooString* getInfoString(Dict *infoDict, char *key);
static GooString* getInfoDate(Dict *infoDict, char *key);
static char textEncName[128] = "";
// Option table: switch, kind, destination, destination size (only used
// for argString entries) and usage text.  The {NULL} entry terminates
// the table.
static ArgDesc argDesc[] = {
{"-f", argInt, &firstPage, 0,
"first page to convert"},
{"-l", argInt, &lastPage, 0,
"last page to convert"},
/*{"-raw", argFlag, &rawOrder, 0,
"keep strings in content stream order"},*/
{"-q", argFlag, &errQuiet, 0,
"don't print any messages or errors"},
{"-h", argFlag, &printHelp, 0,
"print usage information"},
{"-help", argFlag, &printHelp, 0,
"print usage information"},
{"-p", argFlag, &printHtml, 0,
"exchange .pdf links by .html"},
{"-c", argFlag, &complexMode, 0,
"generate complex document"},
{"-i", argFlag, &ignore, 0,
"ignore images"},
{"-noframes", argFlag, &noframes, 0,
"generate no frames"},
{"-stdout" ,argFlag, &stout, 0,
"use standard output"},
{"-zoom", argFP, &scale, 0,
"zoom the pdf document (default 1.5)"},
{"-xml", argFlag, &xml, 0,
"output for XML post-processing"},
{"-hidden", argFlag, &showHidden, 0,
"output hidden text"},
{"-nomerge", argFlag, &noMerge, 0,
"do not merge paragraphs"},
{"-enc", argString, textEncName, sizeof(textEncName),
"output text encoding name"},
{"-dev", argString, gsDevice, sizeof(gsDevice),
"output device name for Ghostscript (png16m, jpeg etc)"},
{"-v", argFlag, &printVersion, 0,
"print copyright and version info"},
{"-opw", argString, ownerPassword, sizeof(ownerPassword),
"owner password (for encrypted files)"},
{"-upw", argString, userPassword, sizeof(userPassword),
"user password (for encrypted files)"},
{"-nodrm", argFlag, &noDrm, 0,
"override document DRM settings"},
{NULL}
};
// pdftohtml entry point.
//
// Usage: pdftohtml [options] <PDF-file> [<html-file> <xml-file>]
//
// Steps:
//   1. parse the command line (prints the banner/usage and exits with
//      status 1 on bad arguments, -h or -v);
//   2. open the PDF and enforce the copy-permission bit unless -nodrm;
//   3. derive the output base name (explicit argument with its
//      .html/.xml suffix removed, or the input name with .pdf removed);
//   4. render the pages through HtmlOutputDev;
//   5. in complex (non-XML, images enabled) mode, write a temporary
//      PostScript file and run Ghostscript on it to produce one background
//      image per page.
//
// NOTE(review): every path that reaches the "error" label returns 0, so
// failures after argument parsing are not reflected in the exit status.
// Left unchanged here because callers may depend on it.
int main(int argc, char *argv[]) {
  PDFDoc *doc = NULL;
  GooString *fileName = NULL;
  GooString *docTitle = NULL;
  GooString *author = NULL, *keywords = NULL, *subject = NULL, *date = NULL;
  GooString *htmlFileName = NULL;
  GooString *psFileName = NULL;
  HtmlOutputDev *htmlOut = NULL;
  PSOutputDev *psOut = NULL;
  GBool ok;
  char *p;
  char extension[16] = "png";
  GooString *ownerPW, *userPW;
  Object info;
  char * extsList[] = {"png", "jpeg", "bmp", "pcx", "tiff", "pbm", NULL};

  // NOTE: "goto error" is used below, so no variable with an initializer
  // may be declared at function scope past this point; keep such
  // declarations inside nested blocks.

  // parse args
  ok = parseArgs(argDesc, &argc, argv);
  if (!ok || argc < 2 || argc > 3 || printHelp || printVersion) {
    fprintf(stderr, "pdftohtml version %s http://pdftohtml.sourceforge.net/, based on Xpdf version %s\n", "0.36", xpdfVersion);
    fprintf(stderr, "%s\n", "Copyright 1999-2003 Gueorgui Ovtcharov and Rainer Dorsch");
    fprintf(stderr, "%s\n\n", xpdfCopyright);
    if (!printVersion) {
      printUsage("pdftohtml", "<PDF-file> [<html-file> <xml-file>]", argDesc);
    }
    exit(1);
  }

  // init error file
  //errorInit();

  // read config file
  globalParams = new GlobalParams("");

  if (errQuiet) {
    globalParams->setErrQuiet(errQuiet);
    printCommands = gFalse; // I'm not 100% sure what the difference between them is
  }

  if (textEncName[0]) {
    globalParams->setTextEncoding(textEncName);
    if( !globalParams->getTextEncoding() ) {
      goto error;
    }
  }

  // open PDF file
  if (ownerPassword[0]) {
    ownerPW = new GooString(ownerPassword);
  } else {
    ownerPW = NULL;
  }
  if (userPassword[0]) {
    userPW = new GooString(userPassword);
  } else {
    userPW = NULL;
  }

  fileName = new GooString(argv[1]);

  doc = new PDFDoc(fileName, ownerPW, userPW);
  if (userPW) {
    delete userPW;
  }
  if (ownerPW) {
    delete ownerPW;
  }
  if (!doc->isOk()) {
    goto error;
  }

  // check for copy permission
  if (!doc->okToCopy()) {
    if (!noDrm) {
      error(-1, "Copying of text from this document is not allowed.");
      goto error;
    }
    fprintf(stderr, "Document has copy-protection bit set.\n");
  }

  // construct text file name
  if (argc == 3) {
    GooString* tmp = new GooString(argv[2]);
    // The suffix to strip depends on the output mode. Compare exactly
    // strlen(suffix) trailing characters, and only when the name is long
    // enough, so the pointer never lands before the start of the buffer.
    const char *lowerExt = xml ? ".xml" : ".html";
    const char *upperExt = xml ? ".XML" : ".HTML";
    const int extLen = static_cast<int>(strlen(lowerExt));
    if (tmp->getLength() >= extLen) {
      p = tmp->getCString() + tmp->getLength() - extLen;
      if (!strcmp(p, lowerExt) || !strcmp(p, upperExt)) {
        htmlFileName = new GooString(tmp->getCString(),
                                     tmp->getLength() - extLen);
      }
    }
    if (!htmlFileName) {
      htmlFileName = new GooString(tmp);
    }
    delete tmp;
  } else {
    // No output name given: use the input name minus a trailing ".pdf".
    if (fileName->getLength() >= 4) {
      p = fileName->getCString() + fileName->getLength() - 4;
      if (!strcmp(p, ".pdf") || !strcmp(p, ".PDF")) {
        htmlFileName = new GooString(fileName->getCString(),
                                     fileName->getLength() - 4);
      }
    }
    if (!htmlFileName) {
      htmlFileName = fileName->copy();
    }
    // htmlFileName->append(".html");
  }

  // clamp the zoom factor
  if (scale>3.0) scale=3.0;
  if (scale<0.5) scale=0.5;

  // reconcile mutually dependent modes
  if (complexMode) {
    //noframes=gFalse;
    stout=gFalse;
  }

  if (stout) {
    noframes=gTrue;
    complexMode=gFalse;
  }

  if (xml)
  {
    complexMode = gTrue;
    noframes = gTrue;
    noMerge = gTrue;
  }

  // get page range
  if (firstPage < 1)
    firstPage = 1;
  if (lastPage < 1 || lastPage > doc->getNumPages())
    lastPage = doc->getNumPages();

  // collect document metadata; the date prefers ModDate over CreationDate
  doc->getDocInfo(&info);
  if (info.isDict()) {
    docTitle = getInfoString(info.getDict(), "Title");
    author = getInfoString(info.getDict(), "Author");
    keywords = getInfoString(info.getDict(), "Keywords");
    subject = getInfoString(info.getDict(), "Subject");
    date = getInfoDate(info.getDict(), "ModDate");
    if( !date )
      date = getInfoDate(info.getDict(), "CreationDate");
  }
  info.free();
  if( !docTitle ) docTitle = new GooString(htmlFileName);

  /* determine extension of output background images: the first known
     extension that occurs as a substring of the Ghostscript device name */
  {
    int i;
    for(i = 0; extsList[i]; i++)
    {
      if( strstr(gsDevice, extsList[i]) != (char *) NULL )
      {
        strncpy(extension, extsList[i], sizeof(extension));
        break;
      }
    }
  }

  rawOrder = complexMode; // todo: figure out what exactly rawOrder do :)

  // write text file
  htmlOut = new HtmlOutputDev(htmlFileName->getCString(),
                              docTitle->getCString(),
                              author ? author->getCString() : NULL,
                              keywords ? keywords->getCString() : NULL,
                              subject ? subject->getCString() : NULL,
                              date ? date->getCString() : NULL,
                              extension,
                              rawOrder,
                              firstPage,
                              doc->getCatalog()->getOutline()->isDict());
  delete docTitle;
  if( author )
  {
    delete author;
  }
  if( keywords )
  {
    delete keywords;
  }
  if( subject )
  {
    delete subject;
  }
  if( date )
  {
    delete date;
  }

  if (htmlOut->isOk())
  {
    doc->displayPages(htmlOut, firstPage, lastPage, 72, 72, 0,
                      gTrue, gFalse, gFalse);
    if (!xml)
    {
      htmlOut->dumpDocOutline(doc->getCatalog());
    }
  }

  // complex mode: render page backgrounds via PostScript + Ghostscript
  if( complexMode && !xml && !ignore ) {
    int h=xoutRound(htmlOut->getPageHeight()/scale);
    int w=xoutRound(htmlOut->getPageWidth()/scale);
    //int h=xoutRound(doc->getPageHeight(1)/scale);
    //int w=xoutRound(doc->getPageWidth(1)/scale);

    psFileName = new GooString(htmlFileName->getCString());
    psFileName->append(".ps");

    globalParams->setPSPaperWidth(w);
    globalParams->setPSPaperHeight(h);
    // XXX
    // globalParams->setPSNoText(gTrue);
    psOut = new PSOutputDev(psFileName->getCString(), doc->getXRef(),
                            doc->getCatalog(), firstPage, lastPage, psModePS);
    doc->displayPages(psOut, firstPage, lastPage, 72, 72, 0,
                      gTrue, gFalse, gFalse);
    delete psOut;

    // NOTE(review): the command line is assembled by string concatenation;
    // the file names are wrapped in double quotes but not escaped, so a
    // name containing '"' or shell metacharacters is passed to the shell
    // as-is. Flagged, not changed.
    GooString *gsCmd = new GooString(GHOSTSCRIPT);
    GooString *tw, *th, *sc;
    gsCmd->append(" -sDEVICE=");
    gsCmd->append(gsDevice);
    gsCmd->append(" -dBATCH -dNOPROMPT -dNOPAUSE -r");
    sc = GooString::fromInt(static_cast<int>(72*scale));
    gsCmd->append(sc);
    gsCmd->append(" -sOutputFile=");
    gsCmd->append("\"");
    gsCmd->append(htmlFileName);
    gsCmd->append("%03d.");
    gsCmd->append(extension);
    gsCmd->append("\" -g");
    tw = GooString::fromInt(static_cast<int>(scale*w));
    gsCmd->append(tw);
    gsCmd->append("x");
    th = GooString::fromInt(static_cast<int>(scale*h));
    gsCmd->append(th);
    gsCmd->append(" -q \"");
    gsCmd->append(psFileName);
    gsCmd->append("\"");
    // printf("running: %s\n", gsCmd->getCString());
    if( !executeCommand(gsCmd->getCString()) && !errQuiet) {
      error(-1, "Failed to launch Ghostscript!\n");
    }
    // the intermediate PostScript file is removed whether or not gs ran
    unlink(psFileName->getCString());
    delete tw;
    delete th;
    delete sc;
    delete gsCmd;
    delete psFileName;
  }

  delete htmlOut;

  // clean up
 error:
  if(doc) delete doc;
  if(globalParams) delete globalParams;

  if(htmlFileName) delete htmlFileName;
  HtmlFont::clear();

  // check for memory leaks
  Object::memCheck(stderr);
  gMemReport(stderr);

  return 0;
}
// Look up `key` in the info dictionary and return a newly allocated copy
// of its value when that value is a string; otherwise return NULL.
// The caller owns (and must delete) the returned GooString.
static GooString* getInfoString(Dict *infoDict, char *key) {
  Object entry;
  GooString *copy = NULL;

  Object *found = infoDict->lookup(key, &entry);
  if (found->isString()) {
    copy = new GooString(entry.getString());
  }
  // release the looked-up object only after its string has been copied
  entry.free();
  return copy;
}
// Look up `key` in the info dictionary and return its value as a newly
// allocated date string, or NULL when the entry is not a string.
//
// An optional leading "D:" is skipped. If six numeric fields
// (YYYYMMDDhhmmss) can then be read, the result is formatted as
// "YYYY-MM-DDThh:mm:ss+00:00"; otherwise (or if formatting fails) the
// text after the optional prefix is returned unchanged.
// The caller owns (and must delete) the returned GooString.
static GooString* getInfoDate(Dict *infoDict, char *key) {
  Object entry;
  GooString *formatted = NULL;

  Object *found = infoDict->lookup(key, &entry);
  if (found->isString()) {
    char *text = entry.getString()->getCString();
    if (strncmp(text, "D:", 2) == 0) {
      text += 2;
    }

    int year, mon, day, hour, min, sec;
    const int fields = sscanf(text, "%4d%2d%2d%2d%2d%2d",
                              &year, &mon, &day, &hour, &min, &sec);
    if (fields == 6) {
      struct tm when;
      char stamp[256];

      when.tm_year = year - 1900;
      when.tm_mon = mon - 1;
      when.tm_mday = day;
      when.tm_hour = hour;
      when.tm_min = min;
      when.tm_sec = sec;
      when.tm_wday = -1;
      when.tm_yday = -1;
      when.tm_isdst = -1;
      // let mktime fill in the tm_wday and tm_yday fields
      mktime(&when);
      if (strftime(stamp, sizeof(stamp), "%Y-%m-%dT%H:%M:%S+00:00", &when)) {
        formatted = new GooString(stamp);
      }
    }

    // unparsable or unformattable date: hand back the raw text
    if (!formatted) {
      formatted = new GooString(text);
    }
  }
  entry.free();
  return formatted;
}
--- NEW FILE: pdftoppm.1 ---
.\" Copyright 2004 Glyph & Cog, LLC
.TH pdftoppm 1 "22 January 2004"
.SH NAME
pdftoppm \- Portable Document Format (PDF) to Portable Pixmap (PPM)
converter (version 3.00)
.SH SYNOPSIS
.B pdftoppm
[options]
.I PDF-file PPM-root
.SH DESCRIPTION
.B Pdftoppm
converts Portable Document Format (PDF) files to color image files in
Portable Pixmap (PPM) format, grayscale image files in Portable
Graymap (PGM) format, or monochrome image files in Portable Bitmap
(PBM) format.
.PP
Pdftoppm reads the PDF file,
.IR PDF-file ,
and writes one PPM file for each page,
.IR PPM-root - nnnnnn .ppm,
where
.I nnnnnn
is the page number.
.SH CONFIGURATION FILE
Pdftoppm reads a configuration file at startup. It first tries to
find the user's private config file, ~/.xpdfrc. If that doesn't
exist, it looks for a system-wide config file, /etc/xpdf/xpdfrc. See the
.BR xpdfrc (5)
man page for details.
.SH OPTIONS
Many of the following options can be set with configuration file
commands. These are listed in square brackets with the description of
the corresponding command line option.
.TP
.BI \-f " number"
Specifies the first page to convert.
.TP
.BI \-l " number"
Specifies the last page to convert.
.TP
.BI \-r " number"
Specifies the resolution, in DPI. The default is 150 DPI.
.TP
.B \-mono
Generate a monochrome PBM file (instead of a color PPM file).
.TP
.B \-gray
Generate a grayscale PGM file (instead of a color PPM file).
.TP
.BI \-t1lib " yes | no"
Enable or disable t1lib (a Type 1 font rasterizer). This defaults to
"yes".
.RB "[config file: " enableT1lib ]
.TP
.BI \-freetype " yes | no"
Enable or disable FreeType (a TrueType / Type 1 font rasterizer).
This defaults to "yes".
.RB "[config file: " enableFreeType ]
.TP
.BI \-aa " yes | no"
Enable or disable font anti-aliasing. This defaults to "yes".
.RB "[config file: " antialias ]
.TP
.BI \-opw " password"
Specify the owner password for the PDF file. Providing this will
bypass all security restrictions.
.TP
.BI \-upw " password"
Specify the user password for the PDF file.
.TP
.B \-q
Don't print any messages or errors.
.RB "[config file: " errQuiet ]
.TP
.B \-v
Print copyright and version information.
.TP
.B \-h
Print usage information.
.RB ( \-help
and
.B \-\-help
are equivalent.)
.SH EXIT CODES
The Xpdf tools use the following exit codes:
.TP
0
No error.
.TP
1
Error opening a PDF file.
.TP
2
Error opening an output file.
.TP
3
Error related to PDF permissions.
.TP
99
Other error.
.SH AUTHOR
The pdftoppm software and documentation are copyright 1996-2004 Glyph
& Cog, LLC.
.SH "SEE ALSO"
.BR xpdf (1),
.BR pdftops (1),
.BR pdftotext (1),
.BR pdfinfo (1),
.BR pdffonts (1),
.BR pdfimages (1),
.BR xpdfrc (5)
.br
.B http://www.foolabs.com/xpdf/
--- NEW FILE: pdftoppm.cc ---
//========================================================================
//
// pdftoppm.cc
//
// Copyright 2003 Glyph & Cog, LLC
//
//========================================================================
#include <poppler-config.h>
#include <stdio.h>
#include "parseargs.h"
#include "goo/gmem.h"
#include "goo/GooString.h"
#include "GlobalParams.h"
#include "Object.h"
#include "PDFDoc.h"
#include "splash/SplashBitmap.h"
#include "splash/Splash.h"
#include "SplashOutputDev.h"
#include "config.h"
// ---------------------------------------------------------------------
// Command-line option state, written by parseArgs() through the argDesc
// table below and read by main.
// ---------------------------------------------------------------------
// Page range; main raises firstPage to at least 1 and replaces a lastPage
// that is < 1 or beyond the document with the page count.
static int firstPage = 1;
static int lastPage = 0;
// Rendering resolution in DPI, used for both axes.
static int resolution = 150;
// Output format selectors; main rejects the combination of both.
static GBool mono = gFalse;
static GBool gray = gFalse;
// "yes"/"no" style option strings forwarded to GlobalParams; an empty
// first byte means the option was not supplied.
static char enableT1libStr[16] = "";
static char enableFreeTypeStr[16] = "";
static char antialiasStr[16] = "";
static char ownerPassword[33] = "";
static char userPassword[33] = "";
static GBool quiet = gFalse;
// Passed to the GlobalParams constructor in main.
static char cfgFileName[256] = "";
static GBool printVersion = gFalse;
static GBool printHelp = gFalse;
// Option table consumed by parseArgs() and printUsage(). Each entry is
// {option name, argument kind, destination, destination size (string
// options only), usage text}; the table ends with a {NULL} sentinel.
// The -t1lib and -freetype entries are only compiled in when the
// corresponding HAVE_* configuration macros are set.
static ArgDesc argDesc[] = {
{"-f", argInt, &firstPage, 0,
"first page to print"},
{"-l", argInt, &lastPage, 0,
"last page to print"},
{"-r", argInt, &resolution, 0,
"resolution, in DPI (default is 150)"},
{"-mono", argFlag, &mono, 0,
"generate a monochrome PBM file"},
{"-gray", argFlag, &gray, 0,
"generate a grayscale PGM file"},
#if HAVE_T1LIB_H
{"-t1lib", argString, enableT1libStr, sizeof(enableT1libStr),
"enable t1lib font rasterizer: yes, no"},
#endif
#if HAVE_FREETYPE_FREETYPE_H | HAVE_FREETYPE_H
{"-freetype", argString, enableFreeTypeStr, sizeof(enableFreeTypeStr),
"enable FreeType font rasterizer: yes, no"},
#endif
{"-aa", argString, antialiasStr, sizeof(antialiasStr),
"enable font anti-aliasing: yes, no"},
{"-opw", argString, ownerPassword, sizeof(ownerPassword),
"owner password (for encrypted files)"},
{"-upw", argString, userPassword, sizeof(userPassword),
"user password (for encrypted files)"},
{"-q", argFlag, &quiet, 0,
"don't print any messages or errors"},
{"-cfg", argString, cfgFileName, sizeof(cfgFileName),
"configuration file to use in place of .xpdfrc"},
{"-v", argFlag, &printVersion, 0,
"print copyright and version info"},
{"-h", argFlag, &printHelp, 0,
"print usage information"},
{"-help", argFlag, &printHelp, 0,
"print usage information"},
{"--help", argFlag, &printHelp, 0,
"print usage information"},
{"-?", argFlag, &printHelp, 0,
"print usage information"},
{NULL}
};
int main(int argc, char *argv[]) {
PDFDoc *doc;
GooString *fileName;
char *ppmRoot;
char ppmFile[512];
GooString *ownerPW, *userPW;
SplashColor paperColor;
SplashOutputDev *splashOut;
GBool ok;
int exitCode;
int pg;
exitCode = 99;
// parse args
ok = parseArgs(argDesc, &argc, argv);
if (mono && gray) {
ok = gFalse;
}
if (!ok || argc != 3 || printVersion || printHelp) {
fprintf(stderr, "pdftoppm version %s\n", xpdfVersion);
fprintf(stderr, "%s\n", xpdfCopyright);
if (!printVersion) {
printUsage("pdftoppm", "<PDF-file> <PPM-root>", argDesc);
}
goto err0;
}
fileName = new GooString(argv[1]);
ppmRoot = argv[2];
// read config file
globalParams = new GlobalParams(cfgFileName);
globalParams->setupBaseFonts(NULL);
if (enableT1libStr[0]) {
if (!globalParams->setEnableT1lib(enableT1libStr)) {
fprintf(stderr, "Bad '-t1lib' value on command line\n");
}
}
if (enableFreeTypeStr[0]) {
if (!globalParams->setEnableFreeType(enableFreeTypeStr)) {
fprintf(stderr, "Bad '-freetype' value on command line\n");
}
}
if (antialiasStr[0]) {
if (!globalParams->setAntialias(antialiasStr)) {
fprintf(stderr, "Bad '-aa' value on command line\n");
}
}
if (quiet) {
globalParams->setErrQuiet(quiet);
}
// open PDF file
if (ownerPassword[0]) {
ownerPW = new GooString(ownerPassword);
} else {
ownerPW = NULL;
}
if (userPassword[0]) {
userPW = new GooString(userPassword);
} else {
userPW = NULL;
}
doc = new PDFDoc(fileName, ownerPW, userPW);
if (userPW) {
delete userPW;
}
if (ownerPW) {
delete ownerPW;
}
if (!doc->isOk()) {
exitCode = 1;
goto err1;
}
// get page range
if (firstPage < 1)
firstPage = 1;
if (lastPage < 1 || lastPage > doc->getNumPages())
lastPage = doc->getNumPages();
// write PPM files
paperColor.rgb8 = splashMakeRGB8(255, 255, 255);
splashOut = new SplashOutputDev(mono ? splashModeMono1 :
gray ? splashModeMono8 :
splashModeRGB8,
gFalse, paperColor);
splashOut->startDoc(doc->getXRef());
for (pg = firstPage; pg <= lastPage; ++pg) {
doc->displayPage(splashOut, pg, resolution, resolution, 0, gTrue, gFalse);
sprintf(ppmFile, "%.*s-%06d.%s",
(int)sizeof(ppmFile) - 32, ppmRoot, pg,
mono ? "pbm" : gray ? "pgm" : "ppm");
splashOut->getBitmap()->writePNMFile(ppmFile);
}
delete splashOut;
exitCode = 0;
// clean up
err1:
delete doc;
delete globalParams;
err0:
// check for memory leaks
Object::memCheck(stderr);
gMemReport(stderr);
return exitCode;
}
--- NEW FILE: pdftops.1 ---
.\" Copyright 1996-2004 Glyph & Cog, LLC
.TH pdftops 1 "22 January 2004"
.SH NAME
pdftops \- Portable Document Format (PDF) to PostScript converter
(version 3.00)
.SH SYNOPSIS
.B pdftops
[options]
.RI [ PDF-file
.RI [ PS-file ]]
.SH DESCRIPTION
.B Pdftops
converts Portable Document Format (PDF) files to PostScript so they
can be printed.
.PP
Pdftops reads the PDF file,
.IR PDF-file ,
and writes a PostScript file,
.IR PS-file .
If
.I PS-file
is not specified, pdftops converts
.I file.pdf
to
.I file.ps
(or
.I file.eps
with the -eps option). If
.I PS-file
is \'-', the PostScript is sent to stdout.
.SH CONFIGURATION FILE
Pdftops reads a configuration file at startup. It first tries to find
the user's private config file, ~/.xpdfrc. If that doesn't exist, it
looks for a system-wide config file, /etc/xpdf/xpdfrc. See the
.BR xpdfrc (5)
man page for details.
.SH OPTIONS
Many of the following options can be set with configuration file
commands. These are listed in square brackets with the description of
the corresponding command line option.
.TP
.BI \-f " number"
Specifies the first page to print.
.TP
.BI \-l " number"
Specifies the last page to print.
.TP
.B \-level1
Generate Level 1 PostScript. The resulting PostScript files will be
significantly larger (if they contain images), but will print on Level
1 printers. This also converts all images to black and white. No
more than one of the PostScript level options (-level1, -level1sep,
-level2, -level2sep, -level3, -level3sep) may be given.
.RB "[config file: " psLevel ]
.TP
.B \-level1sep
Generate Level 1 separable PostScript. All colors are converted to
CMYK. Images are written with separate stream data for the four
components.
.RB "[config file: " psLevel ]
.TP
.B \-level2
Generate Level 2 PostScript. Level 2 supports color images and image
compression. This is the default setting.
.RB "[config file: " psLevel ]
.TP
.B \-level2sep
Generate Level 2 separable PostScript. All colors are converted to
CMYK. The PostScript separation convention operators are used to
handle custom (spot) colors.
.RB "[config file: " psLevel ]
.TP
.B \-level3
Generate Level 3 PostScript. This enables all Level 2 features plus
CID font embedding.
.RB "[config file: " psLevel ]
.TP
.B \-level3sep
Generate Level 3 separable PostScript. The separation handling is the
same as for -level2sep.
.RB "[config file: " psLevel ]
.TP
.B \-eps
Generate an Encapsulated PostScript (EPS) file. An EPS file contains
a single image, so if you use this option with a multi-page PDF file,
you must use -f and -l to specify a single page. No more than one of
the mode options (-eps, -form) may be given.
.TP
.B \-form
Generate a PostScript form which can be imported by software that
understands forms. A form contains a single page, so if you use this
option with a multi-page PDF file, you must use -f and -l to specify a
single page. The -level1 option cannot be used with -form.
.TP
.B \-opi
Generate OPI comments for all images and forms which have OPI
information. (This option is only available if pdftops was compiled
with OPI support.)
.RB "[config file: " psOPI ]
.TP
.B \-noembt1
By default, any Type 1 fonts which are embedded in the PDF file are
copied into the PostScript file. This option causes pdftops to
substitute base fonts instead. Embedded fonts make PostScript files
larger, but may be necessary for readable output.
.RB "[config file: " psEmbedType1Fonts ]
.TP
.B \-noembtt
By default, any TrueType fonts which are embedded in the PDF file are
copied into the PostScript file. This option causes pdftops to
substitute base fonts instead. Embedded fonts make PostScript files
larger, but may be necessary for readable output. Also, some
PostScript interpreters do not have TrueType rasterizers.
.RB "[config file: " psEmbedTrueTypeFonts ]
.TP
.B \-noembcidps
By default, any CID PostScript fonts which are embedded in the PDF
file are copied into the PostScript file. This option disables that
embedding. No attempt is made to substitute for non-embedded CID
PostScript fonts.
.RB "[config file: " psEmbedCIDPostScriptFonts ]
.TP
.B \-noembcidtt
By default, any CID TrueType fonts which are embedded in the PDF file
are copied into the PostScript file. This option disables that
embedding. No attempt is made to substitute for non-embedded CID
TrueType fonts.
.RB "[config file: " psEmbedCIDTrueTypeFonts ]
.TP
.BI \-paper " size"
Set the paper size to one of "letter", "legal", "A4", or "A3". This
can also be set to "match", which will set the paper size to match the
size specified in the PDF file.
.RB "[config file: " psPaperSize ]
.TP
.BI \-paperw " size"
Set the paper width, in points.
.RB "[config file: " psPaperSize ]
.TP
.BI \-paperh " size"
Set the paper height, in points.
.RB "[config file: " psPaperSize ]
.TP
.B \-nocrop
By default, output is cropped to the CropBox specified in the PDF
file. This option disables cropping.
.RB "[config file: " psCrop ]
.TP
.B \-expand
Expand PDF pages smaller than the paper to fill the paper. By
default, these pages are not scaled.
.RB "[config file: " psExpandSmaller ]
.TP
.B \-noshrink
Don't scale PDF pages which are larger than the paper. By default,
pages larger than the paper are shrunk to fit.
.RB "[config file: " psShrinkLarger ]
.TP
.B \-nocenter
By default, PDF pages smaller than the paper (after any scaling) are
centered on the paper. This option causes them to be aligned to the
lower-left corner of the paper instead.
.RB "[config file: " psCenter ]
.TP
.B \-duplex
Set the Duplex pagedevice entry in the PostScript file. This tells
duplex-capable printers to enable duplexing.
.RB "[config file: " psDuplex ]
.TP
.BI \-opw " password"
Specify the owner password for the PDF file. Providing this will
bypass all security restrictions.
.TP
.BI \-upw " password"
Specify the user password for the PDF file.
.TP
.B \-q
Don't print any messages or errors.
.RB "[config file: " errQuiet ]
.TP
.BI \-cfg " config-file"
Read
.I config-file
in place of ~/.xpdfrc or the system-wide config file.
.TP
.B \-v
Print copyright and version information.
.TP
.B \-h
Print usage information.
.RB ( \-help
and
.B \-\-help
are equivalent.)
.SH EXIT CODES
The Xpdf tools use the following exit codes:
.TP
0
No error.
.TP
1
Error opening a PDF file.
.TP
2
Error opening an output file.
.TP
3
Error related to PDF permissions.
.TP
99
Other error.
.SH AUTHOR
The pdftops software and documentation are copyright 1996-2004 Glyph &
Cog, LLC.
.SH "SEE ALSO"
.BR xpdf (1),
.BR pdftotext (1),
.BR pdfinfo (1),
.BR pdffonts (1),
.BR pdftoppm (1),
.BR pdfimages (1),
.BR xpdfrc (5)
.br
.B http://www.foolabs.com/xpdf/
--- NEW FILE: pdftops.cc ---
//========================================================================
//
// pdftops.cc
//
// Copyright 1996-2003 Glyph & Cog, LLC
//
// Modified for Debian by Hamish Moffatt, 22 May 2002.
//
//========================================================================
#include <poppler-config.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <string.h>
#include "parseargs.h"
#include "goo/GooString.h"
#include "goo/gmem.h"
#include "GlobalParams.h"
#include "Object.h"
#include "Stream.h"
#include "Array.h"
#include "Dict.h"
#include "XRef.h"
#include "Catalog.h"
#include "Page.h"
#include "PDFDoc.h"
#include "PSOutputDev.h"
#include "Error.h"
#include "config.h"
// ---------------------------------------------------------------------
// Command-line option state, written by parseArgs() through the argDesc
// table below and read by main.
// ---------------------------------------------------------------------
// Page range; main raises firstPage to at least 1 and replaces a lastPage
// that is < 1 or beyond the document with the page count.
static int firstPage = 1;
static int lastPage = 0;
// PostScript level selectors; main rejects more than one being set and
// falls back to Level 2 when none is given.
static GBool level1 = gFalse;
static GBool level1Sep = gFalse;
static GBool level2 = gFalse;
static GBool level2Sep = gFalse;
static GBool level3 = gFalse;
static GBool level3Sep = gFalse;
// Output mode selectors; main rejects -eps together with -form.
static GBool doEPS = gFalse;
static GBool doForm = gFalse;
#if OPI_SUPPORT
static GBool doOPI = gFalse;
#endif
static GBool noEmbedT1Fonts = gFalse;
static GBool noEmbedTTFonts = gFalse;
static GBool noEmbedCIDPSFonts = gFalse;
static GBool noEmbedCIDTTFonts = gFalse;
// Named paper size; when set it takes precedence over paperWidth/Height.
static char paperSize[15] = "";
static int paperWidth = 0;
static int paperHeight = 0;
static GBool noCrop = gFalse;
static GBool expand = gFalse;
static GBool noShrink = gFalse;
static GBool noCenter = gFalse;
static GBool duplex = gFalse;
// The passwords start out as "\001" rather than "": main treats a leading
// '\001' as "option not given", which lets an explicitly supplied empty
// password still be passed on to PDFDoc.
static char ownerPassword[33] = "\001";
static char userPassword[33] = "\001";
static GBool quiet = gFalse;
// Passed to the GlobalParams constructor in main.
static char cfgFileName[256] = "";
static GBool printVersion = gFalse;
static GBool printHelp = gFalse;
// Option table consumed by parseArgs() and printUsage(). Each entry is
// {option name, argument kind, destination, destination size (string
// options only), usage text}; the table ends with a {NULL} sentinel.
static ArgDesc argDesc[] = {
{"-f", argInt, &firstPage, 0,
"first page to print"},
{"-l", argInt, &lastPage, 0,
"last page to print"},
{"-level1", argFlag, &level1, 0,
"generate Level 1 PostScript"},
{"-level1sep", argFlag, &level1Sep, 0,
"generate Level 1 separable PostScript"},
{"-level2", argFlag, &level2, 0,
"generate Level 2 PostScript"},
{"-level2sep", argFlag, &level2Sep, 0,
"generate Level 2 separable PostScript"},
{"-level3", argFlag, &level3, 0,
"generate Level 3 PostScript"},
{"-level3sep", argFlag, &level3Sep, 0,
"generate Level 3 separable PostScript"},
{"-eps", argFlag, &doEPS, 0,
"generate Encapsulated PostScript (EPS)"},
{"-form", argFlag, &doForm, 0,
"generate a PostScript form"},
#if OPI_SUPPORT
{"-opi", argFlag, &doOPI, 0,
"generate OPI comments"},
#endif
{"-noembt1", argFlag, &noEmbedT1Fonts, 0,
"don't embed Type 1 fonts"},
{"-noembtt", argFlag, &noEmbedTTFonts, 0,
"don't embed TrueType fonts"},
{"-noembcidps", argFlag, &noEmbedCIDPSFonts, 0,
"don't embed CID PostScript fonts"},
{"-noembcidtt", argFlag, &noEmbedCIDTTFonts, 0,
"don't embed CID TrueType fonts"},
{"-paper", argString, paperSize, sizeof(paperSize),
"paper size (letter, legal, A4, A3, match)"},
{"-paperw", argInt, &paperWidth, 0,
"paper width, in points"},
{"-paperh", argInt, &paperHeight, 0,
"paper height, in points"},
{"-nocrop", argFlag, &noCrop, 0,
"don't crop pages to CropBox"},
{"-expand", argFlag, &expand, 0,
"expand pages smaller than the paper size"},
{"-noshrink", argFlag, &noShrink, 0,
"don't shrink pages larger than the paper size"},
{"-nocenter", argFlag, &noCenter, 0,
"don't center pages smaller than the paper size"},
{"-duplex", argFlag, &duplex, 0,
"enable duplex printing"},
{"-opw", argString, ownerPassword, sizeof(ownerPassword),
"owner password (for encrypted files)"},
{"-upw", argString, userPassword, sizeof(userPassword),
"user password (for encrypted files)"},
{"-q", argFlag, &quiet, 0,
"don't print any messages or errors"},
{"-cfg", argString, cfgFileName, sizeof(cfgFileName),
"configuration file to use in place of .xpdfrc"},
{"-v", argFlag, &printVersion, 0,
"print copyright and version info"},
{"-h", argFlag, &printHelp, 0,
"print usage information"},
{"-help", argFlag, &printHelp, 0,
"print usage information"},
{"--help", argFlag, &printHelp, 0,
"print usage information"},
{"-?", argFlag, &printHelp, 0,
"print usage information"},
{NULL}
};
// pdftops entry point.
//
// Usage: pdftops [options] <PDF-file> [<PS-file>]
//
// Converts the requested page range of a PDF to PostScript via
// PSOutputDev. When no output name is given it is derived from the input
// name by replacing a trailing ".pdf"/".PDF" with ".ps" (or ".eps" with
// -eps), or by appending that suffix when the input has no such ending.
//
// Exit status: usage and option-conflict errors call exit(1) directly;
// otherwise 0 on success, 1 when the PDF cannot be opened, 2 when the
// output device cannot be created, 3 for a permission error (only when
// built with ENFORCE_PERMISSIONS), and 99 for any other failure.
int main(int argc, char *argv[]) {
  // All locals are declared up front without initializers because the
  // error paths below jump forward with goto.
  PDFDoc *doc;
  GooString *fileName;
  GooString *psFileName;
  PSLevel level;
  PSOutMode mode;
  GooString *ownerPW, *userPW;
  PSOutputDev *psOut;
  GBool ok;
  char *p;
  int exitCode;

  exitCode = 99;

  // parse args
  ok = parseArgs(argDesc, &argc, argv);
  if (!ok || argc < 2 || argc > 3 || printVersion || printHelp) {
    fprintf(stderr, "pdftops version %s\n", xpdfVersion);
    fprintf(stderr, "%s\n", xpdfCopyright);
    if (!printVersion) {
      printUsage("pdftops", "<PDF-file> [<PS-file>]", argDesc);
    }
    exit(1);
  }
  if ((level1 ? 1 : 0) +
      (level1Sep ? 1 : 0) +
      (level2 ? 1 : 0) +
      (level2Sep ? 1 : 0) +
      (level3 ? 1 : 0) +
      (level3Sep ? 1 : 0) > 1) {
    fprintf(stderr, "Error: use only one of the 'level' options.\n");
    exit(1);
  }
  if (doEPS && doForm) {
    fprintf(stderr, "Error: use only one of -eps and -form\n");
    exit(1);
  }
  // -level2 has no branch of its own: it is the fall-through default
  if (level1) {
    level = psLevel1;
  } else if (level1Sep) {
    level = psLevel1Sep;
  } else if (level2Sep) {
    level = psLevel2Sep;
  } else if (level3) {
    level = psLevel3;
  } else if (level3Sep) {
    level = psLevel3Sep;
  } else {
    level = psLevel2;
  }
  if (doForm && level < psLevel2) {
    fprintf(stderr, "Error: forms are only available with Level 2 output.\n");
    exit(1);
  }
  mode = doEPS ? psModeEPS
               : doForm ? psModeForm
                        : psModePS;
  fileName = new GooString(argv[1]);

  // read config file
  globalParams = new GlobalParams(cfgFileName);
  if (paperSize[0]) {
    if (!globalParams->setPSPaperSize(paperSize)) {
      fprintf(stderr, "Invalid paper size\n");
      // fileName has not been handed to a PDFDoc yet, so free it here
      delete fileName;
      goto err0;
    }
  } else {
    if (paperWidth) {
      globalParams->setPSPaperWidth(paperWidth);
    }
    if (paperHeight) {
      globalParams->setPSPaperHeight(paperHeight);
    }
  }
  // only options actually given on the command line override the config
  if (noCrop) {
    globalParams->setPSCrop(gFalse);
  }
  if (expand) {
    globalParams->setPSExpandSmaller(gTrue);
  }
  if (noShrink) {
    globalParams->setPSShrinkLarger(gFalse);
  }
  if (noCenter) {
    globalParams->setPSCenter(gFalse);
  }
  if (duplex) {
    globalParams->setPSDuplex(duplex);
  }
  if (level1 || level1Sep || level2 || level2Sep || level3 || level3Sep) {
    globalParams->setPSLevel(level);
  }
  if (noEmbedT1Fonts) {
    globalParams->setPSEmbedType1(!noEmbedT1Fonts);
  }
  if (noEmbedTTFonts) {
    globalParams->setPSEmbedTrueType(!noEmbedTTFonts);
  }
  if (noEmbedCIDPSFonts) {
    globalParams->setPSEmbedCIDPostScript(!noEmbedCIDPSFonts);
  }
  if (noEmbedCIDTTFonts) {
    globalParams->setPSEmbedCIDTrueType(!noEmbedCIDTTFonts);
  }
#if OPI_SUPPORT
  if (doOPI) {
    globalParams->setPSOPI(doOPI);
  }
#endif
  if (quiet) {
    globalParams->setErrQuiet(quiet);
  }

  // open PDF file; a leading '\001' marks a password that was not given
  if (ownerPassword[0] != '\001') {
    ownerPW = new GooString(ownerPassword);
  } else {
    ownerPW = NULL;
  }
  if (userPassword[0] != '\001') {
    userPW = new GooString(userPassword);
  } else {
    userPW = NULL;
  }
  doc = new PDFDoc(fileName, ownerPW, userPW);
  if (userPW) {
    delete userPW;
  }
  if (ownerPW) {
    delete ownerPW;
  }
  if (!doc->isOk()) {
    exitCode = 1;
    goto err1;
  }

#ifdef ENFORCE_PERMISSIONS
  // check for print permission
  if (!doc->okToPrint()) {
    error(-1, "Printing this document is not allowed.");
    exitCode = 3;
    goto err1;
  }
#endif

  // construct PostScript file name
  if (argc == 3) {
    psFileName = new GooString(argv[2]);
  } else {
    // Only look at the last four characters when the name has that many;
    // otherwise the pointer would land before the start of the buffer.
    psFileName = NULL;
    if (fileName->getLength() >= 4) {
      p = fileName->getCString() + fileName->getLength() - 4;
      if (!strcmp(p, ".pdf") || !strcmp(p, ".PDF")) {
        psFileName = new GooString(fileName->getCString(),
                                   fileName->getLength() - 4);
      }
    }
    if (!psFileName) {
      psFileName = fileName->copy();
    }
    psFileName->append(doEPS ? ".eps" : ".ps");
  }

  // get page range
  if (firstPage < 1) {
    firstPage = 1;
  }
  if (lastPage < 1 || lastPage > doc->getNumPages()) {
    lastPage = doc->getNumPages();
  }

  // check for multi-page EPS or form
  if ((doEPS || doForm) && firstPage != lastPage) {
    error(-1, "EPS and form files can only contain one page.");
    goto err2;
  }

  // write PostScript file
  psOut = new PSOutputDev(psFileName->getCString(), doc->getXRef(),
                          doc->getCatalog(), firstPage, lastPage, mode);
  if (psOut->isOk()) {
    doc->displayPages(psOut, firstPage, lastPage, 72, 72,
                      0, globalParams->getPSCrop(), gFalse, gFalse);
  } else {
    delete psOut;
    exitCode = 2;
    goto err2;
  }
  delete psOut;

  exitCode = 0;

  // clean up
 err2:
  delete psFileName;
 err1:
  delete doc;
 err0:
  delete globalParams;

  // check for memory leaks
  Object::memCheck(stderr);
  gMemReport(stderr);

  return exitCode;
}
--- NEW FILE: pdftotext.1 ---
.\" Copyright 1997-2004 Glyph & Cog, LLC
.TH pdftotext 1 "22 January 2004"
.SH NAME
pdftotext \- Portable Document Format (PDF) to text converter
(version 3.00)
.SH SYNOPSIS
.B pdftotext
[options]
.RI [ PDF-file
.RI [ text-file ]]
.SH DESCRIPTION
.B Pdftotext
converts Portable Document Format (PDF) files to plain text.
.PP
Pdftotext reads the PDF file,
.IR PDF-file ,
and writes a text file,
.IR text-file .
If
.I text-file
is not specified, pdftotext converts
.I file.pdf
to
.IR file.txt .
If
.I text-file
is \(aq\-\(aq, the text is sent to stdout.
.SH CONFIGURATION FILE
Pdftotext reads a configuration file at startup. It first tries to
find the user's private config file, ~/.xpdfrc. If that doesn't
exist, it looks for a system-wide config file, /etc/xpdf/xpdfrc. See the
.BR xpdfrc (5)
man page for details.
.SH OPTIONS
Many of the following options can be set with configuration file
commands. These are listed in square brackets with the description of
the corresponding command line option.
.TP
.BI \-f " number"
Specifies the first page to convert.
.TP
.BI \-l " number"
Specifies the last page to convert.
.TP
.B \-layout
Maintain (as best as possible) the original physical layout of the
text. The default is to "undo" physical layout (columns,
hyphenation, etc.) and output the text in reading order.
.TP
.B \-raw
Keep the text in content stream order. This is a hack which often
"undoes" column formatting, etc. Use of raw mode is no longer
recommended.
.TP
.B \-htmlmeta
Generate a simple HTML file, including the meta information. This
simply wraps the text in <pre> and </pre> and prepends the meta
headers.
.TP
.BI \-enc " encoding-name"
Sets the encoding to use for text output. The
.I encoding\-name
must be defined with the unicodeMap command (see
.BR xpdfrc (5)).
The encoding name is case-sensitive. This defaults to "Latin1" (which
is a built-in encoding).
.RB "[config file: " textEncoding ]
.TP
.BI \-eol " unix | dos | mac"
Sets the end-of-line convention to use for text output.
.RB "[config file: " textEOL ]
.TP
.B \-nopgbrk
Don't insert page breaks (form feed characters) between pages.
.RB "[config file: " textPageBreaks ]
.TP
.BI \-opw " password"
Specify the owner password for the PDF file. Providing this will
bypass all security restrictions.
.TP
.BI \-upw " password"
Specify the user password for the PDF file.
.TP
.B \-q
Don't print any messages or errors.
.RB "[config file: " errQuiet ]
.TP
.BI \-cfg " config-file"
Read
.I config-file
in place of ~/.xpdfrc or the system-wide config file.
.TP
.B \-v
Print copyright and version information.
.TP
.B \-h
Print usage information.
.RB ( \-help
and
.B \-\-help
are equivalent.)
.SH BUGS
Some PDF files contain fonts whose encodings have been mangled beyond
recognition. There is no way (short of OCR) to extract text from
these files.
.SH EXIT CODES
The Xpdf tools use the following exit codes:
.TP
0
No error.
.TP
1
Error opening a PDF file.
.TP
2
Error opening an output file.
.TP
3
Error related to PDF permissions.
.TP
99
Other error.
.SH AUTHOR
The pdftotext software and documentation are copyright 1996-2004 Glyph
& Cog, LLC.
.SH "SEE ALSO"
.BR xpdf (1),
.BR pdftops (1),
.BR pdfinfo (1),
.BR pdffonts (1),
.BR pdftoppm (1),
.BR pdfimages (1),
.BR xpdfrc (5)
.br
.B http://www.foolabs.com/xpdf/
--- NEW FILE: pdftotext.cc ---
//========================================================================
//
// pdftotext.cc
//
// Copyright 1997-2003 Glyph & Cog, LLC
//
// Modified for Debian by Hamish Moffatt, 22 May 2002.
//
//========================================================================
#include <poppler-config.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <string.h>
#include "parseargs.h"
#include "goo/GooString.h"
#include "goo/gmem.h"
#include "GlobalParams.h"
#include "Object.h"
#include "Stream.h"
#include "Array.h"
#include "Dict.h"
#include "XRef.h"
#include "Catalog.h"
#include "Page.h"
#include "PDFDoc.h"
#include "TextOutputDev.h"
#include "CharTypes.h"
#include "UnicodeMap.h"
#include "Error.h"
#include "config.h"
// Helpers used by main() to emit document-info entries into the HTML
// header; both are defined after main().
static void printInfoString(FILE *f, Dict *infoDict, char *key,
char *text1, char *text2, UnicodeMap *uMap);
static void printInfoDate(FILE *f, Dict *infoDict, char *key, char *fmt);
// Command-line option storage. Each variable is the destination of one
// or more argDesc entries below and is read by main() after parseArgs().
// Page range; main() clamps firstPage to at least 1 and treats a
// lastPage below 1 (the default) as "through the last page".
static int firstPage = 1;
static int lastPage = 0;
static GBool physLayout = gFalse;
static GBool rawOrder = gFalse;
static GBool htmlMeta = gFalse;
// Empty string means "not given": main() only overrides the configured
// text encoding / end-of-line convention when these are non-empty.
static char textEncName[128] = "";
static char textEOL[16] = "";
static GBool noPageBreaks = gFalse;
// Passwords hold up to 32 characters plus the terminating NUL. The
// initial "\001" is a sentinel: main() passes a password to PDFDoc only
// if the first byte is no longer '\001'.
static char ownerPassword[33] = "\001";
static char userPassword[33] = "\001";
static GBool quiet = gFalse;
static char cfgFileName[256] = "";
static GBool printVersion = gFalse;
static GBool printHelp = gFalse;
// Option table handed to parseArgs() and printUsage(). Each entry lists
// the option name, its kind (argInt / argFlag / argString), the
// destination variable, the destination buffer size (non-zero only for
// string options here), and the usage text. The {NULL} entry ends the
// table. NOTE(review): field meanings are read off the entries; confirm
// against the ArgDesc declaration in parseargs.h.
static ArgDesc argDesc[] = {
{"-f", argInt, &firstPage, 0,
"first page to convert"},
{"-l", argInt, &lastPage, 0,
"last page to convert"},
{"-layout", argFlag, &physLayout, 0,
"maintain original physical layout"},
{"-raw", argFlag, &rawOrder, 0,
"keep strings in content stream order"},
{"-htmlmeta", argFlag, &htmlMeta, 0,
"generate a simple HTML file, including the meta information"},
{"-enc", argString, textEncName, sizeof(textEncName),
"output text encoding name"},
{"-eol", argString, textEOL, sizeof(textEOL),
"output end-of-line convention (unix, dos, or mac)"},
{"-nopgbrk", argFlag, &noPageBreaks, 0,
"don't insert page breaks between pages"},
{"-opw", argString, ownerPassword, sizeof(ownerPassword),
"owner password (for encrypted files)"},
{"-upw", argString, userPassword, sizeof(userPassword),
"user password (for encrypted files)"},
{"-q", argFlag, &quiet, 0,
"don't print any messages or errors"},
{"-cfg", argString, cfgFileName, sizeof(cfgFileName),
"configuration file to use in place of .xpdfrc"},
{"-v", argFlag, &printVersion, 0,
"print copyright and version info"},
// The four spellings below all set the same printHelp flag.
{"-h", argFlag, &printHelp, 0,
"print usage information"},
{"-help", argFlag, &printHelp, 0,
"print usage information"},
{"--help", argFlag, &printHelp, 0,
"print usage information"},
{"-?", argFlag, &printHelp, 0,
"print usage information"},
{NULL}
};
// Entry point of pdftotext: converts a PDF file to a text file, or to a
// minimal HTML document when -htmlmeta is given.
//
// Usage: pdftotext [options] <PDF-file> [<text-file>]
//
// Returns the process exit code:
//    0  success
//    1  the PDF document could not be opened
//    2  the output file could not be opened
//    3  copying is not permitted (only when ENFORCE_PERMISSIONS is defined)
//   99  any other failure (bad arguments, help/version requested,
//       text encoding unavailable)
//
// All locals are declared up front without initializers so that the
// forward gotos into the cleanup ladder (err3 .. err0) never jump over
// an initialization.
int main(int argc, char *argv[]) {
  PDFDoc *doc;
  GooString *fileName;
  GooString *textFileName;
  GooString *ownerPW, *userPW;
  TextOutputDev *textOut;
  FILE *f;
  UnicodeMap *uMap;
  Object info;
  GBool ok;
  char *p;
  int exitCode;

  exitCode = 99;

  // parse args
  ok = parseArgs(argDesc, &argc, argv);
  if (!ok || argc < 2 || argc > 3 || printVersion || printHelp) {
    fprintf(stderr, "pdftotext version %s\n", xpdfVersion);
    fprintf(stderr, "%s\n", xpdfCopyright);
    if (!printVersion) {
      printUsage("pdftotext", "<PDF-file> [<text-file>]", argDesc);
    }
    goto err0;
  }
  fileName = new GooString(argv[1]);

  // read config file, then let command-line options override it
  globalParams = new GlobalParams(cfgFileName);
  if (textEncName[0]) {
    globalParams->setTextEncoding(textEncName);
  }
  if (textEOL[0]) {
    // a bad value is reported but is not fatal
    if (!globalParams->setTextEOL(textEOL)) {
      fprintf(stderr, "Bad '-eol' value on command line\n");
    }
  }
  if (noPageBreaks) {
    globalParams->setTextPageBreaks(gFalse);
  }
  if (quiet) {
    globalParams->setErrQuiet(quiet);
  }

  // get mapping to output encoding
  if (!(uMap = globalParams->getTextEncoding())) {
    error(-1, "Couldn't get text encoding");
    // no PDFDoc exists yet, so fileName must be released here
    delete fileName;
    goto err1;
  }

  // open PDF file; a password is passed only if the option replaced the
  // '\001' sentinel in the corresponding buffer
  if (ownerPassword[0] != '\001') {
    ownerPW = new GooString(ownerPassword);
  } else {
    ownerPW = NULL;
  }
  if (userPassword[0] != '\001') {
    userPW = new GooString(userPassword);
  } else {
    userPW = NULL;
  }
  // NOTE(review): fileName is never deleted after this point; presumably
  // PDFDoc takes ownership of it -- confirm against PDFDoc.
  doc = new PDFDoc(fileName, ownerPW, userPW);
  if (userPW) {
    delete userPW;
  }
  if (ownerPW) {
    delete ownerPW;
  }
  if (!doc->isOk()) {
    exitCode = 1;
    goto err2;
  }

#ifdef ENFORCE_PERMISSIONS
  // check for copy permission
  if (!doc->okToCopy()) {
    error(-1, "Copying of text from this document is not allowed.");
    exitCode = 3;
    goto err2;
  }
#endif

  // construct text file name
  if (argc == 3) {
    textFileName = new GooString(argv[2]);
  } else {
    // Only look at the last four characters when the name actually has
    // four; otherwise the pointer would land before the start of the
    // string buffer.
    p = NULL;
    if (fileName->getLength() >= 4) {
      p = fileName->getCString() + fileName->getLength() - 4;
    }
    if (p && (!strcmp(p, ".pdf") || !strcmp(p, ".PDF"))) {
      textFileName = new GooString(fileName->getCString(),
                                   fileName->getLength() - 4);
    } else {
      textFileName = fileName->copy();
    }
    textFileName->append(htmlMeta ? ".html" : ".txt");
  }

  // get page range
  if (firstPage < 1) {
    firstPage = 1;
  }
  if (lastPage < 1 || lastPage > doc->getNumPages()) {
    lastPage = doc->getNumPages();
  }

  // write HTML header
  if (htmlMeta) {
    if (!textFileName->cmp("-")) {
      f = stdout;
    } else {
      if (!(f = fopen(textFileName->getCString(), "wb"))) {
        error(-1, "Couldn't open text file '%s'", textFileName->getCString());
        exitCode = 2;
        goto err3;
      }
    }
    fputs("<html>\n", f);
    fputs("<head>\n", f);
    doc->getDocInfo(&info);
    if (info.isDict()) {
      printInfoString(f, info.getDict(), "Title", "<title>", "</title>\n",
                      uMap);
      printInfoString(f, info.getDict(), "Subject",
                      "<meta name=\"Subject\" content=\"", "\">\n", uMap);
      printInfoString(f, info.getDict(), "Keywords",
                      "<meta name=\"Keywords\" content=\"", "\">\n", uMap);
      printInfoString(f, info.getDict(), "Author",
                      "<meta name=\"Author\" content=\"", "\">\n", uMap);
      printInfoString(f, info.getDict(), "Creator",
                      "<meta name=\"Creator\" content=\"", "\">\n", uMap);
      printInfoString(f, info.getDict(), "Producer",
                      "<meta name=\"Producer\" content=\"", "\">\n", uMap);
      // printInfoDate() substitutes the date through fprintf(), so the
      // template needs a %s where the value belongs.
      printInfoDate(f, info.getDict(), "CreationDate",
                    "<meta name=\"CreationDate\" content=\"%s\">\n");
      // The info dictionary stores the modification date under "ModDate".
      printInfoDate(f, info.getDict(), "ModDate",
                    "<meta name=\"ModDate\" content=\"%s\">\n");
    }
    info.free();
    fputs("</head>\n", f);
    fputs("<body>\n", f);
    fputs("<pre>\n", f);
    if (f != stdout) {
      fclose(f);
    }
  }

  // write text file; htmlMeta is passed as the last constructor argument
  // (NOTE(review): presumably an "append" flag so the header written
  // above is kept -- confirm against TextOutputDev)
  textOut = new TextOutputDev(textFileName->getCString(),
                              physLayout, rawOrder, htmlMeta);
  if (textOut->isOk()) {
    doc->displayPages(textOut, firstPage, lastPage, 72, 72, 0,
                      gTrue, gFalse, gFalse);
  } else {
    delete textOut;
    exitCode = 2;
    goto err3;
  }
  delete textOut;

  // write end of HTML file (reopened in append mode)
  if (htmlMeta) {
    if (!textFileName->cmp("-")) {
      f = stdout;
    } else {
      if (!(f = fopen(textFileName->getCString(), "ab"))) {
        error(-1, "Couldn't open text file '%s'", textFileName->getCString());
        exitCode = 2;
        goto err3;
      }
    }
    fputs("</pre>\n", f);
    fputs("</body>\n", f);
    fputs("</html>\n", f);
    if (f != stdout) {
      fclose(f);
    }
  }

  exitCode = 0;

  // clean up: each label releases what was acquired before the
  // corresponding failure point, then falls through to the next
 err3:
  delete textFileName;
 err2:
  delete doc;
  uMap->decRefCnt();
 err1:
  delete globalParams;
 err0:

  // check for memory leaks
  Object::memCheck(stderr);
  gMemReport(stderr);

  return exitCode;
}
// Writes one string-valued entry of a document info dictionary to f as
//   text1 <value> text2
// Nothing at all is written when key is missing or its value is not a
// string.
//
//   f         output stream
//   infoDict  dictionary to look key up in
//   key       entry name
//   text1     literal text written before the value
//   text2     literal text written after the value
//   uMap      converts each code point to the output encoding
//
// A value starting with the bytes 0xFE 0xFF is read as big-endian 16-bit
// units (each unit is treated as one code point; pairs of units are not
// combined); otherwise every byte is one code point. A trailing odd byte
// in 16-bit data is dropped rather than paired with the terminator.
//
// The callers in this file place the value inside HTML markup, and the
// value comes straight from the document, so the characters & < > " are
// written as entities instead of being passed through.
static void printInfoString(FILE *f, Dict *infoDict, char *key,
                            char *text1, char *text2, UnicodeMap *uMap) {
  Object obj;
  GooString *s1;
  GBool isUnicode;
  Unicode u;
  char buf[8];
  int i, n, len;

  if (infoDict->lookup(key, &obj)->isString()) {
    fputs(text1, f);
    s1 = obj.getString();
    len = s1->getLength();

    // detect the 0xFE 0xFF marker; it needs at least two bytes
    if (len >= 2 &&
        (s1->getChar(0) & 0xff) == 0xfe &&
        (s1->getChar(1) & 0xff) == 0xff) {
      isUnicode = gTrue;
      i = 2;
    } else {
      isUnicode = gFalse;
      i = 0;
    }

    while (i < len) {
      if (isUnicode) {
        if (i + 1 >= len) {
          // incomplete final 16-bit unit
          break;
        }
        u = ((s1->getChar(i) & 0xff) << 8) |
            (s1->getChar(i + 1) & 0xff);
        i += 2;
      } else {
        u = s1->getChar(i) & 0xff;
        ++i;
      }

      // escape by code point, before encoding, so the decision does not
      // depend on the byte layout of the output encoding
      switch (u) {
      case '&':
        fputs("&amp;", f);
        break;
      case '<':
        fputs("&lt;", f);
        break;
      case '>':
        fputs("&gt;", f);
        break;
      case '"':
        fputs("&quot;", f);
        break;
      default:
        n = uMap->mapUnicode(u, buf, sizeof(buf));
        fwrite(buf, 1, n, f);
        break;
      }
    }
    fputs(text2, f);
  }
  obj.free();
}
static void printInfoDate(FILE *f, Dict *infoDict, char *key, char *fmt) {
Object obj;
char *s;
if (infoDict->lookup(key, &obj)->isString()) {
s = obj.getString()->getCString();
if (s[0] == 'D' && s[1] == ':') {
s += 2;
}
fprintf(f, fmt, s);
}
obj.free();
}
More information about the poppler
mailing list