[Libreoffice-commits] .: 2 commits - dictionaries/hu_HU dictionaries/util

Andras Timar timar at kemper.freedesktop.org
Sat Jan 29 04:27:15 PST 2011


 dictionaries/hu_HU/th_hu_HU_v2.dat |    2 
 dictionaries/util/th_check.pl      |  105 +++++++++++++++++++++++++++++++++++++
 2 files changed, 106 insertions(+), 1 deletion(-)

New commits:
commit 89fc507a0e278381ad012b5be9a964cb0e22c0e7
Author: Steve Butler <sebutler at gmail.com>
Date:   Sat Jan 29 13:26:01 2011 +0100

    th_check.pl for syntax check of thesaurus .dat files

diff --git a/dictionaries/util/th_check.pl b/dictionaries/util/th_check.pl
new file mode 100755
index 0000000..04acc3c
--- /dev/null
+++ b/dictionaries/util/th_check.pl
@@ -0,0 +1,105 @@
+:
+eval 'exec perl -wS $0 ${1+"$@"}'
+    if 0;
+
+# Version: MPL 1.1 / GPLv3+ / LGPLv3+
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License or as specified alternatively below. You may obtain a copy of
+# the License at http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Initial Developer of the Original Code is
+#       Steven Butler <sebutler at gmail.com>
+# Portions created by the Initial Developer are Copyright (C) 2011 the
+# Initial Developer. All Rights Reserved.
+#
+# For minor contributions see the git repository.
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 3 or later (the "GPLv3+"), or
+# the GNU Lesser General Public License Version 3 or later (the "LGPLv3+"),
+# in which case the provisions of the GPLv3+ or the LGPLv3+ are applicable
+# instead of those above.
+
+use strict;
+
+# Scan one mythes-format thesaurus .dat file for structural problems.
+#
+# Parameter:
+#   $input - path of the thesaurus file to check.
+#
+# Checks performed (the first line of the file is skipped, not validated):
+#   * every "word|N" header is followed by exactly N entry lines;
+#   * no word is defined more than once;
+#   * every other line looks like an entry line (starts with '(', '-' or '|').
+#
+# Output: all problems are printed to STDOUT, each prefixed with
+# "<file>:<line>: ".  A file that cannot be opened prints a single FAIL line.
+#
+# Returns 1 if the file could not be opened or any problem was found,
+# 0 otherwise.
+sub processFile($) {
+    my ($input) = @_;
+
+    # Three-argument open with a lexical handle: the name comes straight from
+    # the command line and must never be interpreted as an open mode or pipe.
+    my $fh;
+    if (!open($fh, '<', $input)) {
+        print "FAIL: $input (no input found)\n";
+        return 1;
+    }
+    # top line of thesaurus provides encoding (we ignore it)
+    my $encodingLine = <$fh>;
+    my $line = 1;
+
+    my $expectedEntries;        # count announced by the current word header
+    my $actualEntries = 0;      # entry lines seen since that header
+    my $word;                   # word currently being counted
+    my $wordLine;               # line number of the current word's header
+    my %words = ();             # word => line of its first definition
+    my @errors = ();
+    while (my $text = <$fh>) {
+        $line++;
+        # Strip the line terminator (LF or CRLF) and any trailing whitespace.
+        $text =~ s/\s+\z//;
+        if ($text =~ m/^([^\|]+)\|(\d+)$/) {
+            my ($tword, $texpectedEntries) = ($1, $2);
+
+            # A new header closes the previous word: verify its entry count.
+            # Report against the header actually being counted ($wordLine),
+            # which differs from $words{$word} when the word is a duplicate.
+            if (defined $expectedEntries && $actualEntries != $expectedEntries) {
+                push @errors, "$wordLine: $word defined to have $expectedEntries but seems to have $actualEntries (next word ($tword) found on line $line\n";
+            }
+            $word = $tword;
+            $wordLine = $line;
+            $expectedEntries = $texpectedEntries;
+            if (defined $words{$word}) {
+                push @errors, "$line: $word previously defined on $words{$word}\n";
+            } else {
+                $words{$word} = $line;
+            }
+            $actualEntries = 0;
+        } elsif ($text =~ m/^[\(\-\|]/) {
+            $actualEntries++;
+        } else {
+            push @errors, "$line: Unrecognised line format: $text\n";
+            # A part-of-speech tag missing its parentheses is still an entry;
+            # count it so it does not also trigger a count mismatch.
+            if ($text =~ m/^(interj|prep|conj)\|/) {
+                $actualEntries++;
+            }
+        }
+
+    }
+    close($fh);
+
+    # The loop only validates a word when the following header is seen, so
+    # the final word of the file has to be checked here.
+    if (defined $expectedEntries && $actualEntries != $expectedEntries) {
+        push @errors, "$wordLine: $word defined to have $expectedEntries but seems to have $actualEntries (end of file reached on line $line)\n";
+    }
+
+    if (scalar(@errors)) {
+        print $input, ':', join($input.':', @errors);
+        return 1;
+    }
+    else {
+        return 0;
+    }
+}
+
+# Command-line driver: check every file named on the command line and exit
+# with the number of files that failed (0 means all files are clean).
+if (scalar(@ARGV) == 0) {
+    print "Usage: $0 <thesaurus .dat file>+\n";
+    print "\tscans for some common issues found in mythes format thesaurus files\n";
+    exit(1);
+}
+
+my $errors = 0;
+foreach my $file (@ARGV) {
+    $errors += processFile($file);
+}
+# Exit statuses are truncated to 8 bits on POSIX systems, so a raw count of
+# 256 failing files would be reported as 0 (success).  Clamp to 255 so any
+# failure always yields a non-zero status.
+exit($errors > 255 ? 255 : $errors);
commit 6c615dfbc823c6f6cee30936837577806b6a7f4c
Author: Andras Timar <timar at fsf.hu>
Date:   Sat Jan 29 13:24:28 2011 +0100

    fixed a bug in hu thesaurus thanks to th_check.pl

diff --git a/dictionaries/hu_HU/th_hu_HU_v2.dat b/dictionaries/hu_HU/th_hu_HU_v2.dat
index 8d7c565..44353b8 100644
--- a/dictionaries/hu_HU/th_hu_HU_v2.dat
+++ b/dictionaries/hu_HU/th_hu_HU_v2.dat
@@ -296,8 +296,8 @@ sárgabarack|1
 hogy is hívják|1
 |izé|mi a neve
 vandál|1
-(|1
 |gátlástalan|kíméletlen|féktelen|
+(|1
 (U+0028)|(|nyitó zárójel|zárójel
 ennek következtében|2
 |egyszóval|akkor|így|azért|ezért|hát|szóval|tehát|vagyis|ennélfogva|következésképpen|egy szó mint száz|így hát|nahát|


More information about the Libreoffice-commits mailing list