[PATCH 1/3] Add C++ HelpIndexer

Gert van Valkenhoef g.h.m.van.valkenhoef at rug.nl
Tue Feb 14 10:31:18 PST 2012


---
 l10ntools/prj/build.lst               |    2 +-
 l10ntools/prj/d.lst                   |    6 +-
 l10ntools/source/help/helpindexer.cxx |  247 +++++++++++++++++++++++++++++++++
 l10ntools/source/help/makefile.mk     |   30 ++---
 4 files changed, 263 insertions(+), 22 deletions(-)
 create mode 100644 l10ntools/source/help/helpindexer.cxx

diff --git a/l10ntools/prj/build.lst b/l10ntools/prj/build.lst
index ed919a5..8e3ea70 100644
--- a/l10ntools/prj/build.lst
+++ b/l10ntools/prj/build.lst
@@ -1,4 +1,4 @@
-tr l10ntools : BERKELEYDB:berkeleydb EXPAT:expat LIBXSLT:libxslt LUCENE:lucene sal NULL
+tr l10ntools : BERKELEYDB:berkeleydb EXPAT:expat LIBXSLT:libxslt sal NULL
 tr	l10ntools						usr1	-	all	tr_mkout NULL
 tr	l10ntools\inc					nmake	-	all	tr_inc NULL
 tr	l10ntools\source					nmake	-	all	tr_src tr_inc NULL
diff --git a/l10ntools/prj/d.lst b/l10ntools/prj/d.lst
index eded848..174bb6c 100644
--- a/l10ntools/prj/d.lst
+++ b/l10ntools/prj/d.lst
@@ -26,12 +26,14 @@ mkdir: %_DEST%\bin\help\com\sun\star\help
 ..\%__SRC%\bin\txtconv %_DEST%\bin\txtconv
 ..\%__SRC%\bin\ulfconv %_DEST%\bin\ulfconv
 ..\%__SRC%\class\FCFGMerge.jar %_DEST%\bin\FCFGMerge.jar
-..\%__SRC%\class\HelpIndexerTool.jar %_DEST%\bin\HelpIndexerTool.jar
-..\%__SRC%\bin\HelpLinker %_DEST%\bin\HelpLinker
 ..\%__SRC%\bin\HelpCompiler %_DEST%\bin\HelpCompiler
 ..\%__SRC%\bin\HelpCompiler.exe %_DEST%\bin\HelpCompiler.exe
+..\%__SRC%\bin\HelpLinker %_DEST%\bin\HelpLinker
 ..\%__SRC%\bin\HelpLinker.exe %_DEST%\bin\HelpLinker.exe
 ..\%__SRC%\bin\HelpLinker* %_DEST%\bin
+..\%__SRC%\bin\HelpIndexer %_DEST%\bin\HelpIndexer
+..\%__SRC%\bin\HelpIndexer.exe %_DEST%\bin\HelpIndexer.exe
+..\%__SRC%\bin\HelpIndexer* %_DEST%\bin
 
 ..\scripts\localize %_DEST%\bin\localize
 ..\scripts\fast_merge.pl %_DEST%\bin\fast_merge.pl
diff --git a/l10ntools/source/help/helpindexer.cxx b/l10ntools/source/help/helpindexer.cxx
new file mode 100644
index 0000000..c327119
--- /dev/null
+++ b/l10ntools/source/help/helpindexer.cxx
@@ -0,0 +1,247 @@
+#include <CLucene/StdHeader.h>
+#include <CLucene.h>
+#ifdef TODO
+#include <CLucene/analysis/LanguageBasedAnalyzer.h>
+#endif
+
+#include <unistd.h>
+#include <sys/stat.h>
+#include <dirent.h>
+#include <errno.h>
+#include <string.h>
+
+#include <string>
+#include <iostream>
+#include <algorithm>
+#include <set>
+
+// I assume that TCHAR is defined as wchar_t throughout
+
+using namespace lucene::document;
+
+class HelpIndexer {
+	private:
+		std::string d_lang;
+		std::string d_module;
+		std::string d_captionDir;
+		std::string d_contentDir;
+		std::string d_indexDir;
+		std::string d_error;
+		std::set<std::string> d_files;
+
+	public:
+
+	/**
+	 * @param lang Help files language.
+	 * @param module The module of the helpfiles.
+	 * @param captionDir The directory to scan for caption files.
+	 * @param contentDir The directory to scan for content files.
+	 * @param indexDir The directory to write the index to.
+	 */
+	HelpIndexer(std::string const &lang, std::string const &module,
+		std::string const &captionDir, std::string const &contentDir,
+		std::string const &indexDir);
+
+	/**
+	 * Run the indexer.
+	 * @return true if index successfully generated.
+	 */
+	bool indexDocuments();
+
+	/**
+	 * Get the error string (empty if no error occurred).
+	 */
+	std::string const & getErrorMessage();
+
+	private:
+
+	/**
+	 * Scan the caption & contents directories for help files.
+	 */
+	bool scanForFiles();
+
+	/**
+	 * Scan for files in the given directory.
+	 */
+	bool scanForFiles(std::string const &path);
+
+	/**
+	 * Fill the Document with information on the given help file.
+	 */
+	bool helpDocument(std::string const & fileName, Document *doc);
+
+	/**
+	 * Create a reader for the given file, and create an "empty" reader in case the file doesn't exist.
+	 */
+	lucene::util::Reader *helpFileReader(std::string const & path);
+
+	std::wstring string2wstring(std::string const &source);
+};
+
+HelpIndexer::HelpIndexer(std::string const &lang, std::string const &module,
+	std::string const &captionDir, std::string const &contentDir, std::string const &indexDir) :
+d_lang(lang), d_module(module), d_captionDir(captionDir), d_contentDir(contentDir), d_indexDir(indexDir), d_error(""), d_files() {}
+
+bool HelpIndexer::indexDocuments() {
+	if (!scanForFiles()) {
+		return false;
+	}
+
+#ifdef TODO
+	// Construct the analyzer appropriate for the given language
+	lucene::analysis::Analyzer *analyzer = (
+		d_lang.compare("ja") == 0 ?
+		(lucene::analysis::Analyzer*)new lucene::analysis::LanguageBasedAnalyzer(L"cjk") :
+		(lucene::analysis::Analyzer*)new lucene::analysis::standard::StandardAnalyzer());
+#else
+	lucene::analysis::Analyzer *analyzer = (
+		(lucene::analysis::Analyzer*)new lucene::analysis::standard::StandardAnalyzer());
+#endif
+
+	lucene::index::IndexWriter writer(d_indexDir.c_str(), analyzer, true);
+
+	// Index the identified help files
+	Document doc;
+	for (std::set<std::string>::iterator i = d_files.begin(); i != d_files.end(); ++i) {
+		doc.clear();
+		if (!helpDocument(*i, &doc)) {
+			delete analyzer;
+			return false;
+		}
+		writer.addDocument(&doc);
+	}
+
+	// Optimize the index
+	writer.optimize();
+
+	delete analyzer;
+	return true;
+}
+
+std::string const & HelpIndexer::getErrorMessage() {
+	return d_error;
+}
+
+bool HelpIndexer::scanForFiles() {
+	if (!scanForFiles(d_contentDir)) {
+		return false;
+	}
+	if (!scanForFiles(d_captionDir)) {
+		return false;
+	}
+	return true;
+}
+
+// Add the name of every regular file directly inside `path` to d_files; on failure set d_error and return false.
+bool HelpIndexer::scanForFiles(std::string const & path) {
+	DIR *dir = opendir(path.c_str());
+	if (dir == 0) {
+		int const err = errno; // capture first: building the message below may overwrite errno
+		d_error = "Error reading directory " + path + ": " + strerror(err);
+		return false; // a false result makes scanForFiles() and indexDocuments() fail, so main reports d_error
+	}
+	// Only the bare entry name is stored; d_files is a std::set, so a name seen in both directories is kept once.
+	struct dirent *ent;
+	struct stat info;
+	while ((ent = readdir(dir)) != 0) {
+		if (stat((path + "/" + ent->d_name).c_str(), &info) == 0 && S_ISREG(info.st_mode)) {
+			d_files.insert(ent->d_name);
+		}
+	}
+	closedir(dir);
+	return true;
+}
+
+bool HelpIndexer::helpDocument(std::string const & fileName, Document *doc) {
+	// Add the help path as an indexed, untokenized field.
+	std::wstring path(L"#HLP#" + string2wstring(d_module) + L"/" + string2wstring(fileName));
+	doc->add(*new Field(_T("path"), path.c_str(), Field::STORE_YES | Field::INDEX_UNTOKENIZED));
+
+	// Add the caption as a field.
+	std::string captionPath = d_captionDir + "/" + fileName;
+	doc->add(*new Field(_T("caption"), helpFileReader(captionPath), Field::STORE_NO | Field::INDEX_TOKENIZED));
+	// FIXME: does the Document take responsibility for the FileReader or should I free it somewhere?
+
+	// Add the content as a field.
+	std::string contentPath = d_contentDir + "/" + fileName;
+	doc->add(*new Field(_T("content"), helpFileReader(contentPath), Field::STORE_NO | Field::INDEX_TOKENIZED));
+	// FIXME: does the Document take responsibility for the FileReader or should I free it somewhere?
+
+	return true;
+}
+
+lucene::util::Reader *HelpIndexer::helpFileReader(std::string const & path) {
+	if (access(path.c_str(), R_OK) == 0) {
+		return new lucene::util::FileReader(path.c_str(), "UTF-8");
+	} else {
+		return new lucene::util::StringReader(L"");
+	}
+}
+
+std::wstring HelpIndexer::string2wstring(std::string const &source) {
+	std::wstring target(source.length(), L' ');
+	std::copy(source.begin(), source.end(), target.begin());
+	return target;
+}
+
+int main(int argc, char **argv) {
+	const std::string pLang("-lang");
+	const std::string pModule("-mod");
+	const std::string pOutDir("-zipdir");
+	const std::string pSrcDir("-srcdir");
+
+	std::string lang;
+	std::string module;
+	std::string srcDir;
+	std::string outDir;
+
+	bool error = false;
+	for (int i = 1; i < argc; ++i) {
+		if (pLang.compare(argv[i]) == 0) {
+			if (i + 1 < argc) {
+				lang = argv[++i];
+			} else {
+				error = true;
+			}
+		} else if (pModule.compare(argv[i]) == 0) {
+			if (i + 1 < argc) {
+				module = argv[++i];
+			} else {
+				error = true;
+			}
+		} else if (pOutDir.compare(argv[i]) == 0) {
+			if (i + 1 < argc) {
+				outDir = argv[++i];
+			} else {
+				error = true;
+			}
+		} else if (pSrcDir.compare(argv[i]) == 0) {
+			if (i + 1 < argc) {
+				srcDir = argv[++i];
+			} else {
+				error = true;
+			}
+		} else {
+			error = true;
+		}
+	}
+
+	if (error) {
+		std::cerr << "Error parsing command-line arguments" << std::endl;
+	}
+
+	if (error || lang.empty() || module.empty() || srcDir.empty() || outDir.empty()) {
+		std::cerr << "Usage: HelpIndexer -lang ISOLangCode -mod HelpModule -srcdir SourceDir -zipdir OutputDir" << std::endl;
+		return 1;
+	}
+
+	std::string captionDir(srcDir + "/caption");
+	std::string contentDir(srcDir + "/content");
+	std::string indexDir(outDir + "/" + module + ".idxl");
+	HelpIndexer indexer(lang, module, captionDir, contentDir, indexDir);
+	if (!indexer.indexDocuments()) {
+		std::cerr << indexer.getErrorMessage() << std::endl;
+		return 2;
+	}
+	return 0;
+}
diff --git a/l10ntools/source/help/makefile.mk b/l10ntools/source/help/makefile.mk
index bab01b8..e22c6a3 100644
--- a/l10ntools/source/help/makefile.mk
+++ b/l10ntools/source/help/makefile.mk
@@ -60,8 +60,10 @@ SLOFILES=\
 EXCEPTIONSFILES=\
         $(OBJ)$/HelpLinker.obj \
         $(OBJ)$/HelpCompiler.obj \
+        $(OBJ)$/helpindexer.obj \
         $(SLO)$/HelpLinker.obj \
         $(SLO)$/HelpCompiler.obj
+
 .IF "$(OS)" == "MACOSX" && "$(CPU)" == "P" && "$(COM)" == "GCC"
 # There appears to be a GCC 4.0.1 optimization error causing _file:good() to
 # report true right before the call to writeOut at HelpLinker.cxx:1.12 l. 954
@@ -72,6 +74,9 @@ NOOPTFILES=\
         $(SLO)$/HelpLinker.obj
 .ENDIF
 
+PKGCONFIG_MODULES=libclucene-core
+.INCLUDE : pkg_config.mk
+
 APP1TARGET= $(TARGET)
 APP1OBJS=\
       $(OBJ)$/HelpLinker.obj \
@@ -79,6 +84,12 @@ APP1OBJS=\
 APP1RPATH = NONE
 APP1STDLIBS+=$(SALLIB) $(BERKELEYLIB) $(XSLTLIB) $(EXPATASCII3RDLIB)
 
+APP2TARGET=HelpIndexer
+APP2OBJS=\
+      $(OBJ)$/helpindexer.obj
+APP2RPATH = NONE
+APP2STDLIBS+=$(SALLIB) $(PKGCONFIG_LIBS)
+
 SHL1TARGET	=$(LIBBASENAME)$(DLLPOSTFIX)
 SHL1LIBS=	$(SLB)$/$(TARGET).lib
 .IF "$(COM)" == "MSC"
@@ -93,26 +104,7 @@ SHL1USE_EXPORTS	=ordinal
 DEF1NAME	=$(SHL1TARGET) 
 DEFLIB1NAME	=$(TARGET)
 
-JAVAFILES = \
-    HelpIndexerTool.java			        \
-    HelpFileDocument.java
-
-
-JAVACLASSFILES = \
-    $(CLASSDIR)$/$(PACKAGE)$/HelpIndexerTool.class			        \
-    $(CLASSDIR)$/$(PACKAGE)$/HelpFileDocument.class
 
-.IF "$(SYSTEM_LUCENE)" == "YES"
-EXTRAJARFILES += $(LUCENE_CORE_JAR) $(LUCENE_ANALYZERS_JAR)
-.ELSE
-JARFILES += lucene-core-2.3.jar lucene-analyzers-2.3.jar
-.ENDIF
-JAVAFILES = $(subst,$(CLASSDIR)$/$(PACKAGE)$/, $(subst,.class,.java $(JAVACLASSFILES)))
-
-JARCLASSDIRS	   = $(PACKAGE)/*
-JARTARGET	       = HelpIndexerTool.jar
-JARCOMPRESS        = TRUE 
- 
 # --- Targets ------------------------------------------------------
 
 .INCLUDE :  target.mk
-- 
1.7.0.4


--------------000405030600050500020300
Content-Type: text/x-patch;
 name="core-0002-separate-HelpIndexer-into-header-and-implementation.patch"
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment;
 filename*0="core-0002-separate-HelpIndexer-into-header-and-implementatio";
 filename*1="n.patch"



More information about the LibreOffice mailing list