[PATCH] add clucene helpindexer program
Caolán McNamara
caolanm at redhat.com
Tue Feb 14 03:53:27 PST 2012
---
l10ntools/prj/build.lst | 2 +-
l10ntools/prj/d.lst | 6 +-
l10ntools/source/help/helpindexer.cxx | 247 +++++++++++++++++++++++++++++++++
l10ntools/source/help/makefile.mk | 30 ++---
4 files changed, 263 insertions(+), 22 deletions(-)
create mode 100644 l10ntools/source/help/helpindexer.cxx
diff --git a/l10ntools/prj/build.lst b/l10ntools/prj/build.lst
index 3cce7a3..c714256 100644
--- a/l10ntools/prj/build.lst
+++ b/l10ntools/prj/build.lst
@@ -1,4 +1,4 @@
-tr l10ntools : tools LIBXSLT:libxslt BERKELEYDB:berkeleydb LUCENE:lucene NULL
+tr l10ntools : tools LIBXSLT:libxslt BERKELEYDB:berkeleydb NULL
tr l10ntools usr1 - all tr_mkout NULL
tr l10ntools\inc nmake - all tr_inc NULL
tr l10ntools\source nmake - all tr_src tr_inc NULL
diff --git a/l10ntools/prj/d.lst b/l10ntools/prj/d.lst
index eded848..174bb6c 100644
--- a/l10ntools/prj/d.lst
+++ b/l10ntools/prj/d.lst
@@ -26,12 +26,14 @@ mkdir: %_DEST%\bin\help\com\sun\star\help
..\%__SRC%\bin\txtconv %_DEST%\bin\txtconv
..\%__SRC%\bin\ulfconv %_DEST%\bin\ulfconv
..\%__SRC%\class\FCFGMerge.jar %_DEST%\bin\FCFGMerge.jar
-..\%__SRC%\class\HelpIndexerTool.jar %_DEST%\bin\HelpIndexerTool.jar
-..\%__SRC%\bin\HelpLinker %_DEST%\bin\HelpLinker
..\%__SRC%\bin\HelpCompiler %_DEST%\bin\HelpCompiler
..\%__SRC%\bin\HelpCompiler.exe %_DEST%\bin\HelpCompiler.exe
+..\%__SRC%\bin\HelpLinker %_DEST%\bin\HelpLinker
..\%__SRC%\bin\HelpLinker.exe %_DEST%\bin\HelpLinker.exe
..\%__SRC%\bin\HelpLinker* %_DEST%\bin
+..\%__SRC%\bin\HelpIndexer %_DEST%\bin\HelpIndexer
+..\%__SRC%\bin\HelpIndexer.exe %_DEST%\bin\HelpIndexer.exe
+..\%__SRC%\bin\HelpIndexer* %_DEST%\bin
..\scripts\localize %_DEST%\bin\localize
..\scripts\fast_merge.pl %_DEST%\bin\fast_merge.pl
diff --git a/l10ntools/source/help/helpindexer.cxx b/l10ntools/source/help/helpindexer.cxx
new file mode 100644
index 0000000..c327119
--- /dev/null
+++ b/l10ntools/source/help/helpindexer.cxx
@@ -0,0 +1,247 @@
+#include <CLucene/StdHeader.h>
+#include <CLucene.h>
+#ifdef TODO
+#include <CLucene/analysis/LanguageBasedAnalyzer.h>
+#endif
+
+#include <unistd.h>
+#include <sys/stat.h>
+#include <dirent.h>
+#include <errno.h>
+#include <string.h>
+
+#include <string>
+#include <iostream>
+#include <algorithm>
+#include <set>
+
+// I assume that TCHAR is defined as wchar_t throughout
+
+using namespace lucene::document;
+
+/**
+ * Builds a CLucene index for the help files of a single module.
+ *
+ * The file names found in the caption and content directories are collected
+ * into one set, and each name in that set is indexed as one document.
+ */
+class HelpIndexer {
+    public:
+
+    /**
+     * Remember the settings for a later indexDocuments() call.
+     *
+     * @param lang Help files language.
+     * @param module The module of the helpfiles.
+     * @param captionDir The directory to scan for caption files.
+     * @param contentDir The directory to scan for content files.
+     * @param indexDir The directory to write the index to.
+     */
+    HelpIndexer(std::string const &lang,
+        std::string const &module,
+        std::string const &captionDir,
+        std::string const &contentDir,
+        std::string const &indexDir);
+
+    /**
+     * Scan the directories and generate the index.
+     * @return true if index successfully generated.
+     */
+    bool indexDocuments();
+
+    /**
+     * @return the recorded error string (empty if no error was recorded).
+     */
+    std::string const & getErrorMessage();
+
+    private:
+
+    /**
+     * Scan both the content and the caption directory for help files.
+     */
+    bool scanForFiles();
+
+    /**
+     * Collect the regular files found directly in the given directory.
+     */
+    bool scanForFiles(std::string const &path);
+
+    /**
+     * Add the path, caption and content fields for the given help file
+     * to the Document.
+     */
+    bool helpDocument(std::string const & fileName, Document *doc);
+
+    /**
+     * Return a reader for the given file, or a reader over an empty string
+     * if the file is not readable.
+     */
+    lucene::util::Reader *helpFileReader(std::string const & path);
+
+    /**
+     * Widen a narrow string character by character.
+     */
+    std::wstring string2wstring(std::string const &source);
+
+    // Settings handed to the constructor.
+    std::string d_lang;
+    std::string d_module;
+    std::string d_captionDir;
+    std::string d_contentDir;
+    std::string d_indexDir;
+    // Text of the last recorded error.
+    std::string d_error;
+    // Names (without directory) of the help files found by scanForFiles().
+    std::set<std::string> d_files;
+};
+
+/**
+ * Copy the language, module and directory settings; the error text and the
+ * file set start out empty. Nothing is scanned or written here.
+ */
+HelpIndexer::HelpIndexer(std::string const &lang, std::string const &module,
+    std::string const &captionDir, std::string const &contentDir, std::string const &indexDir)
+    : d_lang(lang)
+    , d_module(module)
+    , d_captionDir(captionDir)
+    , d_contentDir(contentDir)
+    , d_indexDir(indexDir)
+    , d_error()
+    , d_files()
+{
+}
+
+/**
+ * Scan the caption and content directories and add one document per file
+ * name found to an index written to d_indexDir, then optimize that index.
+ *
+ * @return true if every document was added and the index was optimized;
+ *         false if scanning failed, a document could not be built, or
+ *         CLucene raised an error (its message is then available through
+ *         getErrorMessage()).
+ */
+bool HelpIndexer::indexDocuments() {
+    if (!scanForFiles()) {
+        return false;
+    }
+
+#ifdef TODO
+    // Construct the analyzer appropriate for the given language
+    lucene::analysis::Analyzer *analyzer = (
+        d_lang.compare("ja") == 0 ?
+        (lucene::analysis::Analyzer*)new lucene::analysis::LanguageBasedAnalyzer(L"cjk") :
+        (lucene::analysis::Analyzer*)new lucene::analysis::standard::StandardAnalyzer());
+#else
+    lucene::analysis::Analyzer *analyzer =
+        new lucene::analysis::standard::StandardAnalyzer();
+#endif
+
+    bool ok = true;
+    try {
+        // The writer is confined to this scope so that it is destroyed
+        // before the analyzer it was handed is deleted below.
+        // NOTE(review): the third argument is presumably CLucene's "create"
+        // flag (start a new index) -- confirm against the IndexWriter API.
+        lucene::index::IndexWriter writer(d_indexDir.c_str(), analyzer, true);
+
+        // Index the identified help files, stopping at the first failure
+        Document doc;
+        for (std::set<std::string>::iterator i = d_files.begin(); ok && i != d_files.end(); ++i) {
+            doc.clear();
+            if (helpDocument(*i, &doc)) {
+                writer.addDocument(&doc);
+            } else {
+                ok = false;
+            }
+        }
+
+        // Optimize the index
+        if (ok) {
+            writer.optimize();
+        }
+    } catch (CLuceneError &e) {
+        // Report the failure through the normal error channel instead of
+        // letting the exception leak the analyzer and terminate the program.
+        d_error = e.what();
+        ok = false;
+    }
+
+    delete analyzer;
+    return ok;
+}
+
+/**
+ * Accessor for the error text recorded by the indexer.
+ * @return reference to the stored message; empty if nothing was recorded.
+ */
+std::string const & HelpIndexer::getErrorMessage() {
+    return this->d_error;
+}
+
+/**
+ * Collect the help files of both directories.
+ *
+ * The content directory is scanned first; the caption directory is only
+ * scanned if the first scan reported success.
+ *
+ * @return true if both scans reported success.
+ */
+bool HelpIndexer::scanForFiles() {
+    return scanForFiles(d_contentDir) && scanForFiles(d_captionDir);
+}
+
+/**
+ * Add the name of every regular file found directly in the given directory
+ * to d_files. Entries that cannot be stat()ed, or that are not regular
+ * files, are skipped; subdirectories are not descended into.
+ *
+ * @param path The directory to scan.
+ * @return true if the directory was read; false if it could not be opened,
+ *         in which case d_error holds the path and the system error text.
+ */
+bool HelpIndexer::scanForFiles(std::string const & path) {
+    DIR *dir = opendir(path.c_str());
+    if (dir == 0) {
+        // Save errno first: building the message allocates, and that is not
+        // guaranteed to leave errno untouched.
+        const int openError = errno;
+        d_error = "Error reading directory " + path + ": " + strerror(openError);
+        // Signal the failure; returning true here would hide the error from
+        // the caller, which only reads d_error after a false result.
+        return false;
+    }
+
+    struct dirent *ent;
+    struct stat info;
+    while ((ent = readdir(dir)) != 0) {
+        const std::string entryPath = path + "/" + ent->d_name;
+        if (stat(entryPath.c_str(), &info) == 0 && S_ISREG(info.st_mode)) {
+            // Only the bare name is stored; the set removes duplicates when
+            // the same name is inserted from more than one directory.
+            d_files.insert(ent->d_name);
+        }
+    }
+
+    closedir(dir);
+
+    return true;
+}
+
+/**
+ * Fill the Document with the three fields describing one help file:
+ * "path" (stored, untokenized), "caption" and "content" (both tokenized,
+ * not stored, read through helpFileReader()).
+ *
+ * @param fileName Name of the help file, without directory.
+ * @param doc The document receiving the fields.
+ * @return always true.
+ */
+bool HelpIndexer::helpDocument(std::string const & fileName, Document *doc) {
+    // The help path has the form "#HLP#<module>/<file>".
+    std::wstring helpPath(L"#HLP#");
+    helpPath += string2wstring(d_module);
+    helpPath += L"/";
+    helpPath += string2wstring(fileName);
+    doc->add(*new Field(_T("path"), helpPath.c_str(), Field::STORE_YES | Field::INDEX_UNTOKENIZED));
+
+    // NOTE(review): the readers created below are never freed here; this
+    // assumes the Field/Document takes ownership of them -- confirm against
+    // the CLucene Field documentation.
+
+    // Caption text comes from the caption directory.
+    lucene::util::Reader *captionReader = helpFileReader(d_captionDir + "/" + fileName);
+    doc->add(*new Field(_T("caption"), captionReader, Field::STORE_NO | Field::INDEX_TOKENIZED));
+
+    // Content text comes from the content directory.
+    lucene::util::Reader *contentReader = helpFileReader(d_contentDir + "/" + fileName);
+    doc->add(*new Field(_T("content"), contentReader, Field::STORE_NO | Field::INDEX_TOKENIZED));
+
+    return true;
+}
+
+/**
+ * Create a reader for the given file.
+ *
+ * @param path The file to read.
+ * @return a newly allocated FileReader (constructed with "UTF-8") if the
+ *         file is readable, otherwise a newly allocated StringReader over
+ *         an empty string.
+ */
+lucene::util::Reader *HelpIndexer::helpFileReader(std::string const & path) {
+    const bool readable = (access(path.c_str(), R_OK) == 0);
+    if (!readable) {
+        // Unreadable or missing file: hand back an "empty" reader instead.
+        return new lucene::util::StringReader(L"");
+    }
+    return new lucene::util::FileReader(path.c_str(), "UTF-8");
+}
+
+/**
+ * Widen a narrow string to a wide string, one byte per wide character.
+ *
+ * Each byte is converted through unsigned char, so bytes >= 0x80 become the
+ * wide characters 0x80..0xFF instead of being sign-extended where plain
+ * char is signed. ASCII input is converted exactly as before.
+ *
+ * Note: this is a byte-wise widening, not a UTF-8 decoder; multi-byte
+ * sequences are not combined into single characters.
+ *
+ * @param source The narrow string to widen.
+ * @return a wide string with the same number of characters as source has bytes.
+ */
+std::wstring HelpIndexer::string2wstring(std::string const &source) {
+    std::wstring target;
+    target.reserve(source.length());
+    for (std::string::const_iterator it = source.begin(); it != source.end(); ++it) {
+        target.push_back(static_cast<wchar_t>(static_cast<unsigned char>(*it)));
+    }
+    return target;
+}
+
+/**
+ * Command-line entry point.
+ *
+ * Accepts the options -lang, -mod, -srcdir and -zipdir, each followed by a
+ * value. The caption and content directories are "<srcdir>/caption" and
+ * "<srcdir>/content"; the index is written to "<zipdir>/<module>.idxl".
+ *
+ * @return 0 on success, 1 if the arguments are invalid or incomplete,
+ *         2 if indexing failed.
+ */
+int main(int argc, char **argv) {
+    std::string lang;
+    std::string module;
+    std::string srcDir;
+    std::string outDir;
+
+    // Every recognised option together with the variable receiving its value.
+    const struct {
+        const char *flag;
+        std::string *value;
+    } options[] = {
+        { "-lang", &lang },
+        { "-mod", &module },
+        { "-zipdir", &outDir },
+        { "-srcdir", &srcDir }
+    };
+    const size_t optionCount = sizeof(options) / sizeof(options[0]);
+
+    bool error = false;
+    for (int i = 1; i < argc; ++i) {
+        // Find the variable belonging to this argument, if it is a known option.
+        std::string *slot = 0;
+        for (size_t k = 0; k < optionCount && slot == 0; ++k) {
+            if (strcmp(options[k].flag, argv[i]) == 0) {
+                slot = options[k].value;
+            }
+        }
+
+        // Unknown arguments and options without a following value are both
+        // errors; parsing nevertheless continues with the next argument.
+        if (slot != 0 && i + 1 < argc) {
+            *slot = argv[++i];
+        } else {
+            error = true;
+        }
+    }
+
+    if (error) {
+        std::cerr << "Error parsing command-line arguments" << std::endl;
+    }
+
+    const bool incomplete = lang.empty() || module.empty() || srcDir.empty() || outDir.empty();
+    if (error || incomplete) {
+        std::cerr << "Usage: HelpIndexer -lang ISOLangCode -mod HelpModule -srcdir SourceDir -zipdir OutputDir" << std::endl;
+        return 1;
+    }
+
+    HelpIndexer indexer(lang, module,
+        srcDir + "/caption",
+        srcDir + "/content",
+        outDir + "/" + module + ".idxl");
+    if (indexer.indexDocuments()) {
+        return 0;
+    }
+
+    std::cerr << indexer.getErrorMessage() << std::endl;
+    return 2;
+}
diff --git a/l10ntools/source/help/makefile.mk b/l10ntools/source/help/makefile.mk
index bab01b8..e22c6a3 100644
--- a/l10ntools/source/help/makefile.mk
+++ b/l10ntools/source/help/makefile.mk
@@ -60,8 +60,10 @@ SLOFILES=\
EXCEPTIONSFILES=\
$(OBJ)$/HelpLinker.obj \
$(OBJ)$/HelpCompiler.obj \
+ $(OBJ)$/helpindexer.obj \
$(SLO)$/HelpLinker.obj \
$(SLO)$/HelpCompiler.obj
+
.IF "$(OS)" == "MACOSX" && "$(CPU)" == "P" && "$(COM)" == "GCC"
# There appears to be a GCC 4.0.1 optimization error causing _file:good() to
# report true right before the call to writeOut at HelpLinker.cxx:1.12 l. 954
@@ -72,6 +74,9 @@ NOOPTFILES=\
$(SLO)$/HelpLinker.obj
.ENDIF
+PKGCONFIG_MODULES=libclucene-core
+.INCLUDE : pkg_config.mk
+
APP1TARGET= $(TARGET)
APP1OBJS=\
$(OBJ)$/HelpLinker.obj \
@@ -79,6 +84,12 @@ APP1OBJS=\
APP1RPATH = NONE
APP1STDLIBS+=$(SALLIB) $(BERKELEYLIB) $(XSLTLIB) $(EXPATASCII3RDLIB)
+APP2TARGET=HelpIndexer
+APP2OBJS=\
+ $(OBJ)$/helpindexer.obj
+APP2RPATH = NONE
+APP2STDLIBS+=$(SALLIB) $(PKGCONFIG_LIBS)
+
SHL1TARGET =$(LIBBASENAME)$(DLLPOSTFIX)
SHL1LIBS= $(SLB)$/$(TARGET).lib
.IF "$(COM)" == "MSC"
@@ -93,26 +104,7 @@ SHL1USE_EXPORTS =ordinal
DEF1NAME =$(SHL1TARGET)
DEFLIB1NAME =$(TARGET)
-JAVAFILES = \
- HelpIndexerTool.java \
- HelpFileDocument.java
-
-
-JAVACLASSFILES = \
- $(CLASSDIR)$/$(PACKAGE)$/HelpIndexerTool.class \
- $(CLASSDIR)$/$(PACKAGE)$/HelpFileDocument.class
-.IF "$(SYSTEM_LUCENE)" == "YES"
-EXTRAJARFILES += $(LUCENE_CORE_JAR) $(LUCENE_ANALYZERS_JAR)
-.ELSE
-JARFILES += lucene-core-2.3.jar lucene-analyzers-2.3.jar
-.ENDIF
-JAVAFILES = $(subst,$(CLASSDIR)$/$(PACKAGE)$/, $(subst,.class,.java $(JAVACLASSFILES)))
-
-JARCLASSDIRS = $(PACKAGE)/*
-JARTARGET = HelpIndexerTool.jar
-JARCOMPRESS = TRUE
-
# --- Targets ------------------------------------------------------
.INCLUDE : target.mk
--
1.7.7.6
--=-sGkLLrmSIZ7o5tzwMZXA
Content-Disposition: attachment; filename="0001-use-clucene-indexer.patch"
Content-Type: text/x-patch; name="0001-use-clucene-indexer.patch"; charset="UTF-8"
Content-Transfer-Encoding: 7bit
More information about the LibreOffice
mailing list