[Libreoffice-commits] core.git: Branch 'feature/ooxml-analyze' - bin/ooxml-analyze.py

Tue Jul 20 11:22:14 UTC 2021

bin/ooxml-analyze.py |  100 +++++++++++++++++++++------------------------------
 1 file changed, 42 insertions(+), 58 deletions(-)

New commits:
commit 18e89687fde3b3cfac00ead00cbefbb98262cdfe
Author:     Gülşah Köse <gulsah.kose at collabora.com>
AuthorDate: Tue Jul 20 14:15:42 2021 +0300
Commit:     Gülşah Köse <gulsah.kose at collabora.com>
CommitDate: Tue Jul 20 14:20:55 2021 +0300

    remove namespace replacing and some small updates
    
    Change-Id: I2d56668186c8745fca683025710646ae505a0d6b

diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py
index 87acd377c854..cc7a7a036980 100755
--- a/bin/ooxml-analyze.py
+++ b/bin/ooxml-analyze.py
@@ -80,21 +80,22 @@ def extract_files(inputdir, extracted_files_dir):
         shutil.rmtree(extracted_files_dir)
 
     # unzip files into the extracted files directory
-    for filename in os.listdir(inputdir):
-        if (filename.endswith(".pptx") or       \
-            filename.endswith(".docx") or       \
-            filename.endswith(".xlsx")) and not \
-            filename.startswith("~"):
-            filepath = os.path.join(inputdir, filename)
-            extracted_file_path = os.path.join(extracted_files_dir, filename)
-
-            try:
-                with ZipFile(filepath) as zipObj:
-                    zipObj.extractall(extracted_file_path)
-            except:
-                print("%s is problematic" % filename)
-        else:
-            continue
+    for filetype in get_list_of_subdir(inputdir):
+        for filename in os.listdir(filetype):
+            if (filename.endswith(".pptx") or       \
+                filename.endswith(".docx") or       \
+                filename.endswith(".xlsx")) and not \
+                filename.startswith("~"):
+                filepath = os.path.join(filetype, filename)
+                extracted_file_path = os.path.join(extracted_files_dir, filename)
+
+                try:
+                    with ZipFile(filepath) as zipObj:
+                        zipObj.extractall(extracted_file_path)
+                except:
+                    print("%s is problematic" % filename)
+            else:
+                continue
 
 # get key of value in dictionary
 def get_key(val, dict):
@@ -116,34 +117,17 @@ def replace_namespace_with_alias(filename, element):
             element = element.replace("{" + element_ns + "}", "")
     return element
 
-# decides which files shouldn't be analyzed.
+# decides which files should/shouldn't be analyzed.
 def is_file_in_accepted_files(filename):
-    if(filename.endswith("[Content_Types].xml") or \
-       filename.endswith("docProps/custom.xml") or \
-       filename.endswith("docProps/app.xml") or    \
-       filename.endswith("presentation.xml") or \
-       filename.endswith("viewProps.xml") or \
-       filename.endswith("tableStyles.xml") or \
-       filename.endswith("presProps.xml") or \
-       "ppt/slideLayouts" in filename or \
-       "ppt/slideMasters" in filename or \
-       "ppt/theme" in filename or \
-       "ppt/notesMasters" in filename or \
-       "ppt/notesSlides" in filename or \
-       "ppt/handoutMasters" in filename or \
-       "ppt/tags" in filename or \
-       "pptx/customXml" in filename or \
-       "ppt/diagrams" in filename or \
-       filename.endswith("docProps/core.xml") or not \
-       filename.endswith(".xml")):
-       return False
-
-    return True
+    if(filename.endswith(".xml") and "ppt/slides/" in filename):
+       return True
+
+    return False
 
 # counts tags, attribute names and values of xmls
 def count_elements(extracted_files_dir, result_list, concanated_texts_list):
 
-    # make sure if extracted files directory exist
+    # bail out early if the extracted files directory does not exist
     if not (os.path.exists(extracted_files_dir)):
         print("Extracted files directory is not exist")
         return
@@ -160,7 +144,7 @@ def count_elements(extracted_files_dir, result_list, concanated_texts_list):
         try:
             # start to count
             for event, child in etree.iterparse(xmlfile, events=('start', 'end')):
-                tag = replace_namespace_with_alias(xmlfile, child.tag)
+                tag = child.tag #replace_namespace_with_alias(xmlfile, child.tag)
                 tag_idx = get_index_of_tag(tag, result_list)
 
                 if event == "start":
@@ -171,30 +155,29 @@ def count_elements(extracted_files_dir, result_list, concanated_texts_list):
                     else:
                         result_list[tag_idx][0][tag] += 1
 
-                    # count attribute names and values of current tag
-                    for attr_name, attr_value in child.attrib.items():
-                        attr_name = replace_namespace_with_alias(xmlfile, attr_name)
-                        if not attr_name in result_list[tag_idx][1].keys():
-                            result_list[tag_idx][1][attr_name] = 1
-                        else:
-                            result_list[tag_idx][1][attr_name] +=1
-
-                        if not attr_value in result_list[tag_idx][2].keys():
-                            result_list[tag_idx][2][attr_value] = 1
-                        else:
-                            result_list[tag_idx][2][attr_value] +=1
-
-                    # concanated text will be resetted in every paragraph begining
-                    if tag == "a:p":
+                    #count attribute names and values of current tag
+                    #for attr_name, attr_value in child.attrib.items():
+                    #    attr_name = replace_namespace_with_alias(xmlfile, attr_name)
+                    #    if not attr_name in result_list[tag_idx][1].keys():
+                    #        result_list[tag_idx][1][attr_name] = 1
+                    #    else:
+                    #        result_list[tag_idx][1][attr_name] +=1
+
+                    #    if not attr_value in result_list[tag_idx][2].keys():
+                    #        result_list[tag_idx][2][attr_value] = 1
+                    #    else:
+                    #        result_list[tag_idx][2][attr_value] +=1
+
+                    # the concatenated text is reset at the beginning of every paragraph
+                    if tag == "{http://schemas.openxmlformats.org/drawingml/2006/main}p":
                         concatenated_text = ""
 
-
                 if event == "end":
                     # Detect seperate texts in paragraph and concanate them.
-                    if tag == "a:t" and str(child.text) != "None":
+                    if tag == "{http://schemas.openxmlformats.org/drawingml/2006/main}t" and str(child.text) != "None":
                         concatenated_text += str(child.text)
                     # End of the paragraph element, add the text as list item.
-                    if tag == "a:p" and concatenated_text != "":
+                    if tag == "{http://schemas.openxmlformats.org/drawingml/2006/main}p" and concatenated_text != "":
                         concanated_texts_list.append(concatenated_text)
 
                     # count text contents except consisted of whitespaces.
@@ -203,8 +186,9 @@ def count_elements(extracted_files_dir, result_list, concanated_texts_list):
                             result_list[tag_idx][3][child.text] = 1
                         else:
                             result_list[tag_idx][3][child.text] += 1
+
         except Exception as exception:
-            print("%s has %s " % xmlfile, exception)
+            print("%s has %s " % (xmlfile, exception))
 
 # gets the position of "tag" element in result list. If element is not exist,
 # return -1 that points the last index of the list.