[Libreoffice-commits] core.git: Branch 'feature/ooxml-analyze' - bin/ooxml-analyze.py

Gülşah Köse (via logerrit) logerrit at kemper.freedesktop.org
Wed May 26 13:47:59 UTC 2021


 bin/ooxml-analyze.py |   28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

New commits:
commit dbb7762b1235ae245fd5b67046737edf5519fbd9
Author:     Gülşah Köse <gulsah.kose at collabora.com>
AuthorDate: Wed May 26 16:47:12 2021 +0300
Commit:     Gülşah Köse <gulsah.kose at collabora.com>
CommitDate: Wed May 26 16:47:12 2021 +0300

    Replace namespaces with namespace aliases on result
    
    Change-Id: If29c0b5d9eb52a7d42a1d1482010653d2714c8fe

diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py
index efc44bbfa32c..3f9b0e8bdad1 100755
--- a/bin/ooxml-analyze.py
+++ b/bin/ooxml-analyze.py
@@ -33,8 +33,8 @@ def main(argv):
     if(extracted_files_dir_by_user == ''):
         # use default directory path for extracted ooxml files.
         extracted_files_dir = os.path.join(outputdir, 'extractedfiles')
-
         extract_files(inputdir, extracted_files_dir)
+
         count_elements(extracted_files_dir, result_list)
     else:
         # use user defined directory path for extracted ooxml files.
@@ -58,17 +58,36 @@ def extract_files(inputdir, extracted_files_dir):
             filename.endswith(".docx") or       \
             filename.endswith(".xlsx")) and not \
             filename.startswith("~"):
-
             filepath = os.path.join(inputdir, filename)
             extracted_file_path = os.path.join(extracted_files_dir, str(counter))
 
             with ZipFile(filepath) as zipObj:
                 zipObj.extractall(extracted_file_path)
 
-            counter += 1
+            counter +=1
         else:
             continue
 
+# get the key that maps to the given value in the dictionary ('' if none)
+def get_key(val, dict):
+    for key, value in dict.items():
+         if val == value:
+             return str(key)
+    return ''
+
+# replace curly-brace namespaces with their shortened aliases
+def replace_namespace_with_alias(filename, element):
+    namespaces = dict([node for _, node in ET.iterparse(filename, events=['start-ns'])])
+    i = element.find('}')
+    if i>=0:
+        element_ns = element[1:i]
+        element_ns_alias = get_key(element_ns, namespaces)
+        if element_ns_alias !='':
+            element = element.replace("{" + element_ns + "}", element_ns_alias + ":")
+        else:
+            element = element.replace("{" + element_ns + "}", "")
+    return element
+
 # counts tags, attribute names and values of xmls
 def count_elements(extracted_files_dir, result_list):
 
@@ -87,7 +106,7 @@ def count_elements(extracted_files_dir, result_list):
 
             # start to count
             for child in root.iter():
-                tag = str(child.tag)
+                tag = replace_namespace_with_alias(xmlfile, child.tag)
                 tag_idx = get_index_of_tag(tag, result_list)
 
                 # count tags
@@ -99,6 +118,7 @@ def count_elements(extracted_files_dir, result_list):
 
                 # count attribute names and values of current tag
                 for attr_name, attr_value in child.attrib.items():
+                    attr_name = replace_namespace_with_alias(xmlfile, attr_name)
                     if not attr_name in result_list[tag_idx][1].keys():
                         result_list[tag_idx][1][attr_name] = 1
                     else:


More information about the Libreoffice-commits mailing list