[Libreoffice-commits] dictionaries.git: 2 commits - cs_CZ/thesaurus cs_CZ/thes_cs_CZ.dat

Jan Holesovsky kendy at collabora.com
Thu Feb 25 13:43:14 UTC 2016


 cs_CZ/thes_cs_CZ.dat                       |   11 ++-----
 cs_CZ/thesaurus/README.txt                 |    9 ++++++
 cs_CZ/thesaurus/blacklist.txt              |   11 +++++++
 cs_CZ/thesaurus/dictionary-to-thesaurus.py |   42 +++++++++++++++++------------
 4 files changed, 49 insertions(+), 24 deletions(-)

New commits:
commit f04d3f19b453f58411bcbbe7920cd5e16f8957e0
Author: Jan Holesovsky <kendy at collabora.com>
Date:   Thu Feb 25 14:37:44 2016 +0100

    Czech thesaurus: Blacklist some unhelpful meanings.
    
    Change-Id: I7d75626a37d4f241d8d407a11855325e39c5fa63

diff --git a/cs_CZ/thes_cs_CZ.dat b/cs_CZ/thes_cs_CZ.dat
index 95464e9..985bf23 100644
--- a/cs_CZ/thes_cs_CZ.dat
+++ b/cs_CZ/thes_cs_CZ.dat
@@ -5068,7 +5068,7 @@ aspekty|2
 aspidistra|1
 |druh rostliny
 aspik|4
-|zmije|huspenina|rosol
+|huspenina|rosol
 |džem|sulc
 (slov.)|sulcovat
 (přísl.)|želé
@@ -31828,7 +31828,7 @@ husa|1
 husa divoká|1
 |divoká husa
 huspenina|4
-|zmije|aspik|rosol
+|aspik|rosol
 |džem|sulc
 (slov.)|sulcovat
 (přísl.)|želé
@@ -113049,7 +113049,7 @@ rorýs|2
 rosný bod|1
 |bod kondenzace
 rosol|4
-|zmije|aspik|huspenina
+|aspik|huspenina
 |džem|sulc
 (slov.)|sulcovat
 (přísl.)|želé
@@ -134014,8 +134014,6 @@ sčítat|4
 |připočíst|přilít|přičíst, sečíst
 |částka|úhrn|suma|shrnout|sečíst|obnos, suma, součet
 (podst. jm.)|součet|souhrn|obnos
-sčítačka|1
-(podst. jm.)|zmije
 sčítání|6
 (slov.)|přidávání|přidání
 (podst. jm.)|adice|dodatek|přírůstek
@@ -170754,9 +170752,6 @@ zmeškat|3
 (podst. jm.)|slečinka|dívka
 (slov.)|propást|nezasáhnout|netrefit|zameškat|vynechat|opominout|uniknout|pochopit
 |děvče|promeškat|postrádat|minout|míjet
-zmije|2
-(podst. jm.)|sčítačka
-|aspik|huspenina|rosol
 zmijovitý|1
 (příd. jm.)|zmijí
 zmijí|1
diff --git a/cs_CZ/thesaurus/README.txt b/cs_CZ/thesaurus/README.txt
new file mode 100644
index 0000000..fb40d81
--- /dev/null
+++ b/cs_CZ/thesaurus/README.txt
@@ -0,0 +1,9 @@
+To generate a new thesaurus (e.g., after adding terms to blacklist.txt):
+
+  ./dictionary-to-thesaurus.py slovnik_data_utf8.txt blacklist.txt > ../thes_cs_CZ.dat
+
+check the results:
+
+   git diff
+
+and commit & push as usual.
diff --git a/cs_CZ/thesaurus/blacklist.txt b/cs_CZ/thesaurus/blacklist.txt
index ab62ae5..38502ed 100644
--- a/cs_CZ/thesaurus/blacklist.txt
+++ b/cs_CZ/thesaurus/blacklist.txt
@@ -7,3 +7,5 @@
 	(by the way)
 	(po)štvat
 	14. písmeno hebrejské abecedy
+adder	zmije
+aspic	zmije
commit f83b25d29ff2ee17addec24aaebd15113475c360
Author: Jan Holesovsky <kendy at collabora.com>
Date:   Thu Feb 25 14:35:03 2016 +0100

    dictionary-to-thesaurus.py: Move blacklist to a separate file.
    
    Change-Id: Ie05e0c0ce8b4f9541a5a143ddf9ccf960940a3b7

diff --git a/cs_CZ/thesaurus/blacklist.txt b/cs_CZ/thesaurus/blacklist.txt
new file mode 100644
index 0000000..ab62ae5
--- /dev/null
+++ b/cs_CZ/thesaurus/blacklist.txt
@@ -0,0 +1,9 @@
+# Terms that are in the dictionary, but should be left out from thesaurus creation
+#
+# The words here are English–Czech pairs, delimited by a TAB.  When one of
+# the two is missing (is empty), it means "any".  Empty lines are ignored.
+
+	?
+	(by the way)
+	(po)štvat
+	14. písmeno hebrejské abecedy
diff --git a/cs_CZ/thesaurus/dictionary-to-thesaurus.py b/cs_CZ/thesaurus/dictionary-to-thesaurus.py
index d4974ed..8ee022c 100755
--- a/cs_CZ/thesaurus/dictionary-to-thesaurus.py
+++ b/cs_CZ/thesaurus/dictionary-to-thesaurus.py
@@ -18,20 +18,13 @@ import os
 import re
 import sys
 
-# add here the Czech words we want to leave out from the thesaurus generation
-# (misbehaving, mistranslated, etc.)
-ignore_words = [
-    '?',
-    '(by the way)',
-    '(po)štvat',
-    '14. písmeno hebrejské abecedy',
-]
-
 def usage():
-    message = """Usage: {program} slovnik_data_utf8.txt
+    message = """Usage: {program} slovnik_data_utf8.txt backlist.txt
 
-  slovnik_data_utf8.txt: Dictionary data from http://slovnik.zcu.cz/download.php"""
-    print(message.format(program = os.path.basename(sys.argv[0])))
+  slovnik_data_utf8.txt: Dictionary data from http://slovnik.zcu.cz/download.php
+  blacklist.txt:         List of words that should be ignored when generating
+"""
+    sys.stderr.write(message.format(program = os.path.basename(sys.argv[0])))
 
 def classify(typ):
     if typ == '':
@@ -47,7 +40,18 @@ def classify(typ):
 
     return ''
 
-def parse(filename):
+def parse(filename, blacklistname):
+    blacklist = {}
+
+    with open(blacklistname, "r") as fp:
+        for line in fp:
+            if (line == ''):
+                continue
+            elif (line[0] == '#'):
+                continue
+            else:
+                blacklist[line.strip(' \n')] = 1
+
     synonyms = {}
     meanings = {}
 
@@ -73,7 +77,13 @@ def parse(filename):
                 if (word != '' and word[0] == '"' and word[len(word)-1] == '"'):
                     word = word.strip('" ')
 
-                if (word == '' or word in ignore_words):
+                if (word == ''):
+                    continue
+
+                if (index + '\t' + word in blacklist or
+                        index in blacklist or
+                        index + '\t' in blacklist or
+                        '\t' + word in blacklist):
                     continue
 
                 typ = ''
@@ -143,11 +153,11 @@ def buildThesaurus(synonyms, meanings):
                 print line
 
 def main(args):
-    if (len(args) != 2):
+    if (len(args) != 3):
         usage()
         sys.exit(1)
 
-    (synonyms, meanings) = parse(args[1])
+    (synonyms, meanings) = parse(args[1], args[2])
 
     print "UTF-8"
     buildThesaurus(synonyms, meanings)


More information about the Libreoffice-commits mailing list