[Libreoffice-commits] core.git: 4 commits - bin/get-bugzilla-attachments-by-mimetype
David Tardon
dtardon at redhat.com
Sat Dec 7 07:49:46 PST 2013
bin/get-bugzilla-attachments-by-mimetype | 311 ++++++++++++++++++-------------
1 file changed, 183 insertions(+), 128 deletions(-)
New commits:
commit da97677684049b5b7fa2e5f737f66269f16d5b3e
Author: David Tardon <dtardon at redhat.com>
Date: Sat Dec 7 16:35:23 2013 +0100
print bug count in rss query too
Change-Id: I9542771fe3caf77757d81641752ab5463ead4deb
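The XML-RPC path already reports how many bugs a query matched; this adds the
same report to the RSS path. feedparser returns the query result as a
dict-like object whose 'entries' list holds one item per returned bug, so the
count is known before any bug is processed. A minimal standalone sketch (the
query URL is illustrative; any Bugzilla buglist.cgi URL with ctype=rss behaves
the same):

    import feedparser

    url = 'http://bugs.libreoffice.org/buglist.cgi?query_format=advanced&ctype=rss'
    d = feedparser.parse(url)
    # one feed entry per bug matched by the query
    print(str(len(d['entries'])) + ' bugs to process')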
diff --git a/bin/get-bugzilla-attachments-by-mimetype b/bin/get-bugzilla-attachments-by-mimetype
index 1d8ee8b..4aed0e6 100755
--- a/bin/get-bugzilla-attachments-by-mimetype
+++ b/bin/get-bugzilla-attachments-by-mimetype
@@ -204,6 +204,7 @@ def get_through_rss_query(queryurl, mimetype, prefix, suffix):
         url = queryurl + '?' + '&'.join(['='.join(kv) for kv in query.iteritems()])
         print('url is ' + url)
         d = feedparser.parse(url)
+        print(str(len(d['entries'])) + ' bugs to process')
 
         if full:
             available = set([str(entry['id'].split('=')[-1]) for entry in d['entries']])
commit fa8d0cff3a8252c1e8476d724e12a7be50a557ab
Author: David Tardon <dtardon at redhat.com>
Date: Sat Dec 7 16:31:32 2013 +0100
put bz URLs into a dict to avoid repeated code
Change-Id: Iade61f598e1122aae7667de684a68e8164817327
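With the buglist URLs keyed by filename prefix in a single table, supporting
another bugzilla becomes a one-line entry instead of yet another copy of the
query loop. A minimal sketch of the pattern, with a stub do_query standing in
for the script's get_through_rss_query:

    mimetypes = {'application/vnd.oasis.opendocument.text': 'odt'}

    rss_bugzillas = {
        'fdo': 'http://bugs.libreoffice.org/buglist.cgi',
        'gnome': 'http://bugzilla.gnome.org/buglist.cgi',
    }

    def do_query(uri, mimetype, prefix, extension):
        print('%s: querying %s for %s' % (prefix, uri, mimetype))

    # one nested loop replaces one hand-written block per server
    for (prefix, uri) in rss_bugzillas.items():
        for (mimetype, extension) in mimetypes.items():
            do_query(uri, mimetype, prefix, extension)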
diff --git a/bin/get-bugzilla-attachments-by-mimetype b/bin/get-bugzilla-attachments-by-mimetype
index 97d9d2a..1d8ee8b 100755
--- a/bin/get-bugzilla-attachments-by-mimetype
+++ b/bin/get-bugzilla-attachments-by-mimetype
@@ -281,23 +281,26 @@ def get_launchpad_bugs(prefix):
         f.write(handle.read())
         f.close()
 
-freedesktop = 'http://bugs.libreoffice.org/buglist.cgi'
-abisource = 'http://bugzilla.abisource.com/buglist.cgi' #added for abiword
-gentoo = 'http://bugs.gentoo.org/buglist.cgi'
-gnome = 'http://bugzilla.gnome.org/buglist.cgi' # added for gnumeric
-kde = 'http://bugs.kde.org/buglist.cgi' # added for koffice/calligra
-openoffice = 'https://issues.apache.org/ooo/buglist.cgi'
+rss_bugzillas = {
+    'abi': 'http://bugzilla.abisource.com/buglist.cgi', #added for abiword
+    'fdo': 'http://bugs.libreoffice.org/buglist.cgi',
+    'gentoo': 'http://bugs.gentoo.org/buglist.cgi',
+    'gnome': 'http://bugzilla.gnome.org/buglist.cgi', # added for gnumeric
+    'kde': 'http://bugs.kde.org/buglist.cgi', # added for koffice/calligra
+    'mandriva': 'https://qa.mandriva.com/buglist.cgi',
+    'moz': 'https://bugzilla.mozilla.org/buglist.cgi',
+    'novell': 'https://bugzilla.novell.com/buglist.cgi',
+    'ooo': 'https://issues.apache.org/ooo/buglist.cgi',
+}
+
 redhatrpc = 'https://bugzilla.redhat.com/xmlrpc.cgi'
 redhatbug = 'https://bugzilla.redhat.com/show_bug.cgi?id='
-mozilla = 'https://bugzilla.mozilla.org/buglist.cgi'
-mandriva = 'https://qa.mandriva.com/buglist.cgi'
 
 #Novell Bugzilla requires users to log in in order to get details of the bugs such as attachment bodies etc.
 #As a dirty workaround, we parse comments containing "Created an attachment (id=xxxxxx)" and download attachments manually
 #python-bugzilla claims that it supports Novell bugzilla login but it's not working right now and novell bugzilla login
 #system is a nightmare
 novellattach = 'https://bugzilla.novell.com/attachment.cgi?id='
-novell = 'https://bugzilla.novell.com/buglist.cgi'
 
 mimetypes = {
     # ODF
@@ -442,36 +445,13 @@ common_noncore_mimetypes = {
     'application/pdf': 'pdf',
 }
 
-for (mimetype,extension) in mimetypes.items():
-    get_through_rss_query(freedesktop, mimetype, "fdo", extension)
+for (prefix, uri) in rss_bugzillas.items():
+    for (mimetype,extension) in mimetypes.items():
+        get_through_rss_query(uri, mimetype, prefix, extension)
 
 for (mimetype,extension) in mimetypes.items():
     get_through_rpc_query(redhatrpc, redhatbug, mimetype, "rhbz", extension)
 
-for (mimetype,extension) in mimetypes.items():
-    get_through_rss_query(openoffice, mimetype, "ooo", extension)
-
-for (mimetype,extension) in mimetypes.items():
-    get_through_rss_query(novell, mimetype, "novell", extension)
-
-for (mimetype,extension) in mimetypes.items():
-    get_through_rss_query(gnome, mimetype, "gnome", extension)
-
-for (mimetype,extension) in mimetypes.items():
-    get_through_rss_query(abisource, mimetype, "abi", extension)
-
-for (mimetype,extension) in mimetypes.items():
-    get_through_rss_query(kde, mimetype, "kde", extension)
-
-for (mimetype,extension) in mimetypes.items():
-    get_through_rss_query(gentoo, mimetype, "gentoo", extension)
-
-for (mimetype,extension) in mimetypes.items():
-    get_through_rss_query(mozilla, mimetype, "moz", extension)
-
-for (mimetype,extension) in mimetypes.items():
-    get_through_rss_query(mandriva, mimetype, "mandriva", extension)
-
 try:
     get_launchpad_bugs("lp")
 except ImportError:
commit 1ac7e398553b9fa998bac3d6a36fe50ceb5809d5
Author: David Tardon <dtardon at redhat.com>
Date: Sat Dec 7 15:30:19 2013 +0100
enable more bugzillas
Change-Id: If77b213deba85d49e1e8d09015fa81aa6df6aa94
diff --git a/bin/get-bugzilla-attachments-by-mimetype b/bin/get-bugzilla-attachments-by-mimetype
index 3957c0c..97d9d2a 100755
--- a/bin/get-bugzilla-attachments-by-mimetype
+++ b/bin/get-bugzilla-attachments-by-mimetype
@@ -283,12 +283,14 @@ def get_launchpad_bugs(prefix):
 
 freedesktop = 'http://bugs.libreoffice.org/buglist.cgi'
 abisource = 'http://bugzilla.abisource.com/buglist.cgi' #added for abiword
+gentoo = 'http://bugs.gentoo.org/buglist.cgi'
 gnome = 'http://bugzilla.gnome.org/buglist.cgi' # added for gnumeric
 kde = 'http://bugs.kde.org/buglist.cgi' # added for koffice/calligra
 openoffice = 'https://issues.apache.org/ooo/buglist.cgi'
 redhatrpc = 'https://bugzilla.redhat.com/xmlrpc.cgi'
 redhatbug = 'https://bugzilla.redhat.com/show_bug.cgi?id='
 mozilla = 'https://bugzilla.mozilla.org/buglist.cgi'
+mandriva = 'https://qa.mandriva.com/buglist.cgi'
 
 #Novell Bugzilla requires users to log in in order to get details of the bugs such as attachment bodies etc.
 #As a dirty workaround, we parse comments containing "Created an attachment (id=xxxxxx)" and download attachments manually
@@ -461,6 +463,15 @@ for (mimetype,extension) in mimetypes.items():
 for (mimetype,extension) in mimetypes.items():
     get_through_rss_query(kde, mimetype, "kde", extension)
 
+for (mimetype,extension) in mimetypes.items():
+    get_through_rss_query(gentoo, mimetype, "gentoo", extension)
+
+for (mimetype,extension) in mimetypes.items():
+    get_through_rss_query(mozilla, mimetype, "moz", extension)
+
+for (mimetype,extension) in mimetypes.items():
+    get_through_rss_query(mandriva, mimetype, "mandriva", extension)
+
 try:
     get_launchpad_bugs("lp")
 except ImportError:
commit 93b6e31c706cb5b98286fa6368f5483c26ff1505
Author: David Tardon <dtardon at redhat.com>
Date: Sat Dec 7 13:15:36 2013 +0100
try to minimize bugzilla queries
This attempts to solve two deficiencies in the script:
1. If the first attachment of a bug is already downloaded, the bug is
not checked for newly added attachments (or attachments with a newly
fixed mimetype).
2. If none of the eligible attachments of a bug is the first
attachment, the bug will be processed (and the attachments downloaded)
time and again, because the shortcut is only applied to the first
attachment (see 1).
At the same time, it ensures that if the script is killed, the
download is restarted on the next run.
Change-Id: I7f3d1922825bb314f96ec3b1ee2a0ac47604b018
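In outline, the new code makes up to two queries per bugzilla and mimetype: a
cheap one restricted to bugs with activity since the newest downloaded file,
and a full one that is only walked when it returns a bug id for which no file
exists yet; per-attachment isfile checks then skip anything already on disk,
which is what makes an interrupted run restartable. A condensed,
self-contained sketch of that flow follows; run_query, fetch_bug and the
flattened query key are illustrative stand-ins, not the script's actual
helpers:

    import datetime
    import glob
    import os
    import stat

    def run_query(query):
        return []   # stand-in: would return the bug ids matching the query

    def fetch_bug(bugid):
        pass        # stand-in: would download the bug's eligible attachments

    def update(prefix, suffix, base_query):
        # downloaded files are named <suffix>/<prefix><bugid>-<n>.<suffix>
        files = glob.glob(os.path.join(suffix, '%s*.%s' % (prefix, suffix)))
        if files != []:
            # pass 1: only bugs changed since just before our newest
            # download; this catches attachments added to known bugs
            newest = max([os.stat(f)[stat.ST_MTIME] for f in files])
            since = datetime.date.fromtimestamp(newest - 24 * 60 * 60)
            changed = dict(base_query)
            changed['days_elapsed'] = str((datetime.date.today() - since).days)
            for bugid in run_query(changed):
                fetch_bug(bugid)
        # pass 2: the full bug list; skipped entirely when every returned
        # bug id already has at least one downloaded file
        have = set([os.path.basename(f).split('-')[0].replace(prefix, '', 1)
                    for f in files])
        available = set(run_query(base_query))
        if available.difference(have) == set():
            print("assuming all downloaded files are up to date")
            return
        for bugid in available:
            fetch_bug(bugid)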
diff --git a/bin/get-bugzilla-attachments-by-mimetype b/bin/get-bugzilla-attachments-by-mimetype
index 03b9f32..3957c0c 100755
--- a/bin/get-bugzilla-attachments-by-mimetype
+++ b/bin/get-bugzilla-attachments-by-mimetype
@@ -21,8 +21,11 @@
 from __future__ import print_function
 import feedparser
 import base64
+import datetime
+import glob
 import re
 import os, os.path
+import stat
 import sys
 try:
     from urllib.request import urlopen
@@ -49,130 +52,190 @@ def urlopen_retry(url):
 def get_from_bug_url_via_xml(url, mimetype, prefix, suffix):
     id = url.rsplit('=', 2)[1]
     print("id is " + prefix + id + " " + suffix)
-    if os.path.isfile(suffix + '/' + prefix + id + '-1.' + suffix):
-        print("assuming " + id + " is up to date")
-    else:
-        print("parsing " + id)
-        sock = urlopen_retry(url+"&ctype=xml")
-        dom = minidom.parse(sock)
-        sock.close()
-        attachmentid=0
-        for attachment in dom.getElementsByTagName('attachment'):
-            attachmentid += 1
-            print(" mimetype is", end=' ')
-            for node in attachment.childNodes:
-                if node.nodeName == 'type':
-                    print(node.firstChild.nodeValue, end=' ')
-                    if node.firstChild.nodeValue.lower() != mimetype.lower():
-                        print('skipping')
-                        break
-                elif node.nodeName == 'data':
-                    # check if attachment is deleted (i.e. https://bugs.kde.org/show_bug.cgi?id=53343&ctype=xml)
-                    if not node.firstChild:
-                        print('deleted attachment, skipping')
-                        continue
-
-                    download = suffix + '/' +prefix + id + '-' + str(attachmentid) + '.' + suffix
-                    print('downloading as ' + download)
-                    f = open(download, 'wb')
-                    f.write(base64.b64decode(node.firstChild.nodeValue))
-                    f.close()
+    print("parsing " + id)
+    sock = urlopen_retry(url+"&ctype=xml")
+    dom = minidom.parse(sock)
+    sock.close()
+    attachmentid=0
+    for attachment in dom.getElementsByTagName('attachment'):
+        attachmentid += 1
+        print(" mimetype is", end=' ')
+        for node in attachment.childNodes:
+            if node.nodeName == 'type':
+                print(node.firstChild.nodeValue, end=' ')
+                if node.firstChild.nodeValue.lower() != mimetype.lower():
+                    print('skipping')
                     break
+            elif node.nodeName == 'data':
+                # check if attachment is deleted (i.e. https://bugs.kde.org/show_bug.cgi?id=53343&ctype=xml)
+                if not node.firstChild:
+                    print('deleted attachment, skipping')
+                    continue
+
+                download = suffix + '/' +prefix + id + '-' + str(attachmentid) + '.' + suffix
+                if os.path.isfile(download):
+                    print("assuming " + download + " is up to date")
+                    continue
+
+                print('downloading as ' + download)
+                f = open(download, 'wb')
+                f.write(base64.b64decode(node.firstChild.nodeValue))
+                f.close()
+                break
 
 def get_novell_bug_via_xml(url, mimetype, prefix, suffix):
     id = url.rsplit('=', 2)[1]
     print("id is " + prefix + id + " " + suffix)
-    if os.path.isfile(suffix + '/' + prefix + id + '-1.' + suffix):
-        print("assuming " + id + " is up to date")
-    else:
-        print("parsing " + id)
-        sock = urlopen_retry(url+"&ctype=xml")
-        dom = minidom.parse(sock)
-        sock.close()
-        attachmentid=0
-        for comment in dom.getElementsByTagName('thetext'):
-            commentText = comment.firstChild.nodeValue
-            match = re.search(r".*Created an attachment \(id=([0-9]+)\)", commentText)
-            if not match:
-                continue
-
-            attachmentid += 1
-
-            download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix
-            if os.path.isfile(download):
-                print("assuming " + download + " is up to date")
-                continue
-
-            realAttachmentId = match.group(1)
-            handle = urlopen_retry(novellattach + realAttachmentId)
-            if not handle:
-                print("attachment %s is not accessible" % realAttachmentId)
-                continue
-            print(" mimetype is", end=' ')
-
-            info = handle.info()
-            if info.get_content_type:
-                remoteMime = info.get_content_type()
-            else:
-                remoteMime = info.gettype()
-            print(remoteMime, end=' ')
-            if remoteMime != mimetype:
-                print("skipping")
-                continue
-
-            print('downloading as ' + download)
-            f = open(download, 'wb')
-            f.write(handle.read())
-            f.close()
+    print("parsing " + id)
+    sock = urlopen_retry(url+"&ctype=xml")
+    dom = minidom.parse(sock)
+    sock.close()
+    attachmentid=0
+    for comment in dom.getElementsByTagName('thetext'):
+        commentText = comment.firstChild.nodeValue
+        match = re.search(r".*Created an attachment \(id=([0-9]+)\)", commentText)
+        if not match:
+            continue
+
+        attachmentid += 1
+
+        download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix
+        if os.path.isfile(download):
+            print("assuming " + download + " is up to date")
+            continue
+
+        realAttachmentId = match.group(1)
+        handle = urlopen_retry(novellattach + realAttachmentId)
+        if not handle:
+            print("attachment %s is not accessible" % realAttachmentId)
+            continue
+        print(" mimetype is", end=' ')
+
+        info = handle.info()
+        if info.get_content_type:
+            remoteMime = info.get_content_type()
+        else:
+            remoteMime = info.gettype()
+        print(remoteMime, end=' ')
+        if remoteMime != mimetype:
+            print("skipping")
+            continue
+
+        print('downloading as ' + download)
+        f = open(download, 'wb')
+        f.write(handle.read())
+        f.close()
+
+def create_query(mimetype):
+    query = dict()
+    query['query_format']='advanced'
+    query['field0-0-0']='attachments.mimetype'
+    query['type0-0-0']='equals'
+    query['value0-0-0']=mimetype
+    return query
+
+def get_downloaded_files(prefix, suffix):
+    return glob.glob(os.path.join(suffix, '%s*.%s' % (prefix, suffix)))
+
+def get_file_bz_ids(files, prefix):
+    return set([os.path.basename(f).split('-')[0].replace(prefix, '', 1) for f in files])
+
+def get_changed_date(files):
+    newest = max([os.stat(f)[stat.ST_MTIME] for f in files])
+    # Subtract a day to avoid timezone differences. The worst thing that
+    # can happen is that we are going to process more bugs than necessary.
+    return datetime.date.fromtimestamp(newest - 24 * 60 * 60)
 
 def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix):
     try:
         os.mkdir(suffix)
     except:
         pass
-    try:
-        proxy = xmlrpclib.ServerProxy(rpcurl)
-        query = dict()
-        query['column_list']='bug_id'
-        query['query_format']='advanced'
-        query['field0-0-0']='attachments.mimetype'
-        query['type0-0-0']='equals'
-        query['value0-0-0']=mimetype
-        result = proxy.Bug.search(query)
-        bugs = result['bugs']
-        print(str(len(bugs)) + ' bugs to process')
-        for bug in bugs:
-            url = showurl + str(bug['id'])
-            get_from_bug_url_via_xml(url, mimetype, prefix, suffix)
-    except xmlrpclib.Fault as err:
-        print("A fault occurred")
-        print("Fault code: %s" % err.faultCode)
-        print(err.faultString)
-
-def get_through_rss_query_url(url, mimetype, prefix, suffix):
+
+    def process(query, full, have=[]):
+        try:
+            proxy = xmlrpclib.ServerProxy(rpcurl)
+            result = proxy.Bug.search(query)
+            bugs = result['bugs']
+            print(str(len(bugs)) + ' bugs to process')
+
+            if full:
+                available = set([str(bug['id']) for bug in bugs])
+                # we already have files from all available bugs
+                if available.difference(set(have)) == set():
+                    print("assuming all downloaded files are up to date")
+                    return
+
+            for bug in bugs:
+                url = showurl + str(bug['id'])
+                get_from_bug_url_via_xml(url, mimetype, prefix, suffix)
+        except xmlrpclib.Fault as err:
+            print("A fault occurred")
+            print("Fault code: %s" % err.faultCode)
+            print(err.faultString)
+
+    query = create_query(mimetype)
+    query['column_list']='bug_id'
+
+    files = get_downloaded_files(prefix, suffix)
+
+    if files != []:
+        print('looking for updated bugs having %s attachment(s)' % mimetype)
+        query_changed = query.copy()
+        query_changed['field0-1-0'] = 'days_elapsed'
+        query_changed['type0-1-0'] = 'lessthaneq'
+        query_changed['value0-1-0'] = str((datetime.date.today() - get_changed_date(files)).days)
+        process(query_changed, False)
+
+    print('looking for all bugs having %s attachment(s)' % mimetype)
+    process(query, True, get_file_bz_ids(files, prefix))
+
+def get_through_rss_query(queryurl, mimetype, prefix, suffix):
     try:
         os.mkdir(suffix)
     except:
         pass
-    d = feedparser.parse(url)
     #Getting detailed bug information and downloading an attachment body is not possible without logging in to Novell bugzilla
     #get_novell_bug_via_xml function is a workaround for that situation
     get_bug_function = get_novell_bug_via_xml if prefix == "novell" else get_from_bug_url_via_xml
-    for entry in d['entries']:
-        try:
-            get_bug_function(entry['id'], mimetype, prefix, suffix)
-        except KeyboardInterrupt:
-            raise # Ctrl+C should work
-        except:
-            print(entry['id'] + " failed: " + str(sys.exc_info()[0]))
-            pass
-
-def get_through_rss_query(queryurl, mimetype, prefix, suffix):
-    url = queryurl + '?query_format=advanced&field0-0-0=attachments.mimetype&type0-0-0=equals&value0-0-0=' + escape(mimetype) + '&ctype=rss'
-    print('url is ' + url)
-    get_through_rss_query_url(url, mimetype, prefix, suffix)
+    def process(query, full, have=[]):
+        url = queryurl + '?' + '&'.join(['='.join(kv) for kv in query.iteritems()])
+        print('url is ' + url)
+        d = feedparser.parse(url)
+
+        if full:
+            available = set([str(entry['id'].split('=')[-1]) for entry in d['entries']])
+            # we already have files from all available bugs
+            if available.difference(set(have)) == set():
+                print("assuming all downloaded files are up to date")
+                return
+
+        for entry in d['entries']:
+            try:
+                get_bug_function(entry['id'], mimetype, prefix, suffix)
+            except KeyboardInterrupt:
+                raise # Ctrl+C should work
+            except:
+                print(entry['id'] + " failed: " + str(sys.exc_info()[0]))
+                pass
+
+    query = create_query(escape(mimetype))
+    query['ctype'] = 'rss'
+
+    files = get_downloaded_files(prefix, suffix)
+
+    if files != []:
+        print('looking for updated bugs having %s attachment(s)' % mimetype)
+        query_changed = query.copy()
+        query_changed['field0-1-0'] = 'changed'
+        query_changed['type0-1-0'] = 'changedbefore'
+        query_changed['value0-1-0'] = get_changed_date(files).isoformat()
+        process(query_changed, False)
+
+    print('looking for all bugs having %s attachment(s)' % mimetype)
+    process(query, True, get_file_bz_ids(files, prefix))
 
 def get_launchpad_bugs(prefix):
     #launchpadlib python module is required to download launchpad attachments