[poppler] extracting line numbers from text word list
kim branson
kim at gliimpse.com
Tue Jan 21 13:07:48 PST 2014
We’ve been using poppler as a Python extension module to convert a PDF to text and extract information about each token. We construct a TextOutputDev, build a TextWordList from it, and return the font and other attributes for each word.
One thing we’d like to add is the line each token appears on and, optionally, its index within that line. Is there an easy way to do this given a TextOutputDev?
#include "poppler.h"
#include "TextOutputDev.h"
#include <sstream>
#include <cstring>
#include "PDFDocFactory.h"
// Resolution value that Parse() passes twice to PDFDoc::displayPage
// (presumably the horizontal and vertical DPI -- confirm against PDFDoc.h).
const double PopplerParser::resolution = 72.0;
// Opens the PDF named by inputFilename and caches its page count.
//
// inputFilename: path handed to PDFDocFactory::createPDFDoc.
//
// No owner or user password is supplied. If the document could not be
// opened, numPages is set to 0 instead of querying a broken document.
PopplerParser::PopplerParser (const std::string inputFilename) {
    // Assume no user and owner passwords.
    GooString *ownerPW = NULL;
    GooString *userPW = NULL;

    // Build the GooString straight from the std::string's buffer. The earlier
    // code passed a never-initialized stack array here, so the requested file
    // name was never actually used.
    GooString fileName(inputFilename.c_str());

    // Create the document.
    PopplerParser::doc = PDFDocFactory().createPDFDoc(fileName, ownerPW, userPW);

    // Only ask for the page count when the document opened successfully.
    if (PopplerParser::doc != NULL && PopplerParser::doc->isOk()) {
        PopplerParser::numPages = PopplerParser::doc->getNumPages();
    } else {
        PopplerParser::numPages = 0;
    }
}
int PopplerParser::getPages() {
return PopplerParser::numPages;
// Releases the document object that the constructor created.
PopplerParser::~PopplerParser() {
    delete doc;
}
std::string PopplerParser::Parse() {
GBool physLayout = gTrue;
GBool fixedPitch = gFalse;
GBool rawOrder = gFalse;
GBool htmlMeta = gTrue; // required to get the bounding box information
int firstPage = 1;
int lastPage = PopplerParser::doc->getNumPages();
TextOutputDev *textOut;
std::string page_text;
std::string pages_text_data;
std::stringstream ss;
//Word Features
double xMinA, yMinA, xMaxA, yMaxA, r, g, b, fontSize;
TextWord *word;
GooString* fontName;
GBool underLined;
TextFontInfo *fontInfo;
GBool fixedWidth = gFalse;
GBool serif = gFalse;
GBool symbolic = gFalse;
GBool italic = gFalse;
GBool bold =gFalse;
//create our page
// read config file this is requried
globalParams = new GlobalParams();
//create a textOut
textOut = new TextOutputDev(NULL, physLayout, fixedPitch, rawOrder, htmlMeta);
//walk over the pages
for (int page = firstPage; page <= lastPage; ++page) {
PopplerParser::doc->displayPage(textOut, page, resolution, resolution, 0, gTrue, gFalse, gFalse);
TextWordList *wordlist = textOut->makeWordList();
const int word_length = wordlist != NULL ? wordlist->getLength() : 0;
if (word_length > 0) {
//words on the page
for (int i = 0; i < word_length; ++i) {
word = wordlist->get(i);
//Word Features
word->getColor(&r , &g, &b);
underLined = word->isUnderlined();
fontSize = word->getFontSize();
word->getBBox(&xMinA, &yMinA, &xMaxA, &yMaxA);
fontName = word->getFontName(0);
const std::string wordString = word->getText()->getCString();
fontInfo = word->getFontInfo(0); //do this for the first char in the word
fontName = fontInfo->getFontName();
fixedWidth = fontInfo ->isFixedWidth();
serif = fontInfo->isSerif();
symbolic = fontInfo->isSymbolic();
italic = fontInfo->isItalic();
bold = fontInfo->isBold();
// escape quotes in string
std::stringstream newStr;
for (int i = 0; i < wordString.length(); ++i) {
if (wordString[i] == '"' || wordString[i] == '\\') {
newStr << "\\";
newStr << wordString[i];
//construct our string output
ss << "{"
<< "\"xMin\":\"" << xMinA << "\",\"yMin\":\"" << yMinA << "\",\"xMax\":\"" << xMaxA << "\",\"yMax\":\"" << yMaxA
<< "\",\"red\":\"" << r << "\",\"green\":\"" << g << "\",\"blue\":\""<< b
<< "\",\"fontSize\":\"" << fontSize
<< "\",\"italic\":\"" << italic
<< "\",\"serif\":\"" << serif
<< "\",\"symbolic\":\"" << symbolic
<< "\",\"fixedWidth\":\"" << fixedWidth
<< "\",\"bold\":\"" << bold
<< "\",\"fontName\":\"" << fontName->getCString()
<< "\",\"word\":\"" << newStr.str() << "\",\"page\":\""<< page
<< "\"}"
<< std::endl;
//std::cout << ss.str() << std::endl;
delete textOut;
delete globalParams;
//delete wordlist;
pages_text_data = ss.str();
return pages_text_data;
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.freedesktop.org/archives/poppler/attachments/20140121/f2074d4f/attachment-0001.html>
More information about the poppler
mailing list