[poppler] extracting line numbers from text word list

kim branson kim at gliimpse.com
Tue Jan 21 13:07:48 PST 2014


Hi, 

We’ve been using poppler from a Python extension module to convert a PDF to text and extract information about each token. We construct a TextOutputDev, build a TextWordList from it, and return the font and other attributes for each word.

One thing we’d like to add is the line each token appears on, and optionally its index within that line. Is there an easy way to do this given a TextOutputDev?

-kim 


#include "poppler.h"
#include "TextOutputDev.h"
#include <sstream>
#include <cstring>
#include "PDFDocFactory.h"


// Resolution value that Parse() passes to displayPage() for both of its
// resolution arguments (presumably horizontal and vertical DPI — confirm
// against the poppler PDFDoc API).
const double PopplerParser::resolution = 72.0;

/**
 * Opens the PDF named by inputFilename and records its page count.
 *
 * The document is created through PDFDocFactory with no owner or user
 * password. The resulting document is stored in `doc` (released by the
 * destructor) and its page count in `numPages`. If the document could not
 * be opened, `numPages` is set to 0 instead of querying a broken document.
 *
 * @param inputFilename path (or URI) of the PDF to open
 */
PopplerParser::PopplerParser (const std::string inputFilename) {
	// Assume the document has neither an owner nor a user password.
	GooString *ownerPW = NULL;
	GooString *userPW = NULL;

	// Build the GooString directly from the std::string on the stack. This
	// replaces a variable-length char array (a compiler extension, not
	// standard C++), a strcpy and a heap allocation that existed only to
	// hand the file name to the factory.
	GooString fileName(inputFilename.c_str());

	PopplerParser::doc = PDFDocFactory().createPDFDoc(fileName, ownerPW, userPW);

	// Only ask a successfully opened document for its page count.
	if (PopplerParser::doc != NULL && PopplerParser::doc->isOk()) {
		PopplerParser::numPages = PopplerParser::doc->getNumPages();
	} else {
		PopplerParser::numPages = 0;
	}
}

int PopplerParser::getPages() {
	return PopplerParser::numPages; 
}


// Releases the document object created by the constructor.
PopplerParser::~PopplerParser() {
	// Only doc is released here; numPages is not deleted.
	delete this->doc;
}

std::string PopplerParser::Parse() {

		GBool physLayout = gTrue;
		GBool fixedPitch = gFalse;
		GBool rawOrder = gFalse;
		GBool htmlMeta = gTrue;  // required to get the bounding box information
		int firstPage = 1;
		int lastPage = PopplerParser::doc->getNumPages();

		TextOutputDev *textOut;
		std::string page_text;
		std::string pages_text_data;

		std::stringstream ss;

		//Word Features
		double xMinA, yMinA, xMaxA, yMaxA, r, g, b, fontSize;
		TextWord *word;
		GooString* fontName;
		GBool underLined;
		
		TextFontInfo *fontInfo; 

 		GBool fixedWidth = gFalse; 
  		GBool serif = gFalse;
  		GBool symbolic = gFalse;
  		GBool italic = gFalse;
  		GBool bold =gFalse;


		//create our page
		 // read config file this is requried 
  		globalParams = new GlobalParams();

		//create a textOut
		textOut = new TextOutputDev(NULL, physLayout, fixedPitch, rawOrder, htmlMeta);
		
		//walk over the pages
		for (int page = firstPage; page <= lastPage; ++page) {
			PopplerParser::doc->displayPage(textOut, page, resolution, resolution, 0, gTrue, gFalse, gFalse);
			TextWordList *wordlist = textOut->makeWordList();
			const int word_length = wordlist != NULL ? wordlist->getLength() : 0;

			if (word_length > 0) {
				//words on the page
        		for (int i = 0; i < word_length; ++i) {
          			word = wordlist->get(i);

          			//Word Features
          			word->getColor(&r , &g, &b);
          			underLined = word->isUnderlined();
          			fontSize = word->getFontSize();
          			word->getBBox(&xMinA, &yMinA, &xMaxA, &yMaxA);
          			fontName = word->getFontName(0);
          			const std::string wordString = word->getText()->getCString();
          			//fontIno
          			fontInfo = word->getFontInfo(0);  //do this for the first char in the word
          			fontName = fontInfo->getFontName();
          			fixedWidth = fontInfo ->isFixedWidth();
          			serif = fontInfo->isSerif();
          			symbolic = fontInfo->isSymbolic();
          			italic = fontInfo->isItalic();
          			bold = fontInfo->isBold();

                    // escape quotes in string
                    std::stringstream newStr;
                    for (int i = 0; i < wordString.length(); ++i) {
                        if (wordString[i] == '"' || wordString[i] == '\\') {
                            newStr << "\\";
                        }                
                        newStr << wordString[i];       
                    }

          			//construct our string output
                    ss << "{"
          			<<  "\"xMin\":\"" << xMinA << "\",\"yMin\":\"" << yMinA << "\",\"xMax\":\"" << xMaxA << "\",\"yMax\":\"" << yMaxA 
          			<< "\",\"red\":\"" << r << "\",\"green\":\"" << g << "\",\"blue\":\""<< b 
          			<< "\",\"fontSize\":\"" << fontSize 
          			<< "\",\"italic\":\"" << italic 
          			<< "\",\"serif\":\"" << serif
          			<< "\",\"symbolic\":\"" << symbolic
          			<< "\",\"fixedWidth\":\"" << fixedWidth
          			<< "\",\"bold\":\""  << bold
          			<< "\",\"fontName\":\"" << fontName->getCString()
          			<< "\",\"word\":\"" << newStr.str() << "\",\"page\":\""<< page 
                    << "\"}"
                    << std::endl;
          			//std::cout << ss.str() << std::endl;
        		}
			}
		}

		delete textOut;
		delete globalParams;
		//delete wordlist;
		pages_text_data = ss.str();
		return pages_text_data;
}
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.freedesktop.org/archives/poppler/attachments/20140121/f2074d4f/attachment-0001.html>


More information about the poppler mailing list