[Libreoffice-commits] core.git: 4 commits - bin/get-bugzilla-attachments-by-mimetype

Thu Jun 4 12:56:34 PDT 2015

bin/get-bugzilla-attachments-by-mimetype |   80 +++++++++++++++++--------------
 1 file changed, 45 insertions(+), 35 deletions(-)

New commits:
commit 56763e94bf6f59dde3e33e522553eb39b77e81a2
Author: Michael Stahl <mstahl at redhat.com>
Date:   Thu Jun 4 16:57:59 2015 +0200

    get-bugzilla-attachments: actually use the fdo bugzilla
    
    bugs.libreoffice.org redirects to bugs.documentfoundation.org,
    which isn't very helpful as it just causes duplicate downloads.
    
    Arguably freedesktop.org could be removed; there are just ~5 interesting
    attachments since the TDF bugzilla migration.
    
    Change-Id: I26d2667848582209e382226108c47549e99cee97

diff --git a/bin/get-bugzilla-attachments-by-mimetype b/bin/get-bugzilla-attachments-by-mimetype
index a5f1570..7f0dfa2 100755
--- a/bin/get-bugzilla-attachments-by-mimetype
+++ b/bin/get-bugzilla-attachments-by-mimetype
@@ -339,7 +339,7 @@ def get_launchpad_bugs(prefix):
 
 rss_bugzillas = (
     ( 'abi', 'http://bugzilla.abisource.com/buglist.cgi' ), #added for abiword
-    ( 'fdo', 'http://bugs.libreoffice.org/buglist.cgi' ),
+    ( 'fdo', 'http://bugs.freedesktop.org/buglist.cgi' ),
     ( 'gentoo', 'http://bugs.gentoo.org/buglist.cgi' ),
     ( 'gnome', 'http://bugzilla.gnome.org/buglist.cgi' ), # added for gnumeric
     ( 'kde', 'http://bugs.kde.org/buglist.cgi' ), # added for koffice/calligra
commit e5f9ee18476697a64d7ef646a072f8c76cf95b50
Author: Michael Stahl <mstahl at redhat.com>
Date:   Thu Jun 4 12:56:35 2015 +0200

    get-bugzilla-attachments: avoid writing incomplete files
    
    Change-Id: I7d1139ddf8c88626dd716aa537a305c31b5be5d9

diff --git a/bin/get-bugzilla-attachments-by-mimetype b/bin/get-bugzilla-attachments-by-mimetype
index fbc4031..a5f1570 100755
--- a/bin/get-bugzilla-attachments-by-mimetype
+++ b/bin/get-bugzilla-attachments-by-mimetype
@@ -94,9 +94,11 @@ def get_from_bug_url_via_xml(url, mimetype, prefix, suffix):
                         continue
 
                 print('downloading as ' + download)
-                f = open(download, 'wb')
+                tmpfile = download + ".tmp"
+                f = open(tmpfile, 'wb')
                 f.write(base64.b64decode(node.firstChild.nodeValue))
                 f.close()
+                os.rename(tmpfile, download)
                 break
 
 def get_novell_bug_via_xml(url, mimetype, prefix, suffix):
@@ -138,9 +140,11 @@ def get_novell_bug_via_xml(url, mimetype, prefix, suffix):
             continue
 
         print('downloading as ' + download)
-        f = open(download, 'wb')
+        tmpfile = download + ".tmp"
+        f = open(tmpfile, 'wb')
         f.write(handle.read())
         f.close()
+        os.rename(tmpfile, download)
 
 def create_query(mimetype):
     query = dict()
@@ -327,9 +331,11 @@ def get_launchpad_bugs(prefix):
 
                 print('mimetype is ' + handle.content_type + ' downloading as ' + download)
 
-                f = open(download, "w")
+                tmpfile = download + ".tmp"
+                f = open(tmpfile, "wb")
                 f.write(handle.read())
                 f.close()
+                os.rename(tmpfile, download)
 
 rss_bugzillas = (
     ( 'abi', 'http://bugzilla.abisource.com/buglist.cgi' ), #added for abiword
commit bd2eee0bd4ae83ff453522b7cf09b69f1b8b5e1b
Author: Michael Stahl <mstahl at redhat.com>
Date:   Wed Jun 3 23:41:32 2015 +0200

    get-bugzilla-attachments: avoid FDO-TDF duplicates...
    
    ... by checking that a file with "fdo" already exists for bugs older
    than the migration, instead of just ignoring the old bugs on TDF.
    
    There are > 300 additional attachments not on freedesktop.org.
    
    Change-Id: Ib7ee63041109071cc1241a875ef2cccbddfc699d

diff --git a/bin/get-bugzilla-attachments-by-mimetype b/bin/get-bugzilla-attachments-by-mimetype
index 7e6dc83e..fbc4031 100755
--- a/bin/get-bugzilla-attachments-by-mimetype
+++ b/bin/get-bugzilla-attachments-by-mimetype
@@ -86,6 +86,13 @@ def get_from_bug_url_via_xml(url, mimetype, prefix, suffix):
                     print("assuming " + download + " is up to date")
                     continue
 
+                # prevent re-downloading FDO attachments from TDF
+                if prefix == "tdf" and int(id) < 88776:
+                    fdodownload = download.replace("tdf", "fdo")
+                    if os.path.isfile(fdodownload):
+                        print("assuming FDO " + fdodownload + " is up to date")
+                        continue
+
                 print('downloading as ' + download)
                 f = open(download, 'wb')
                 f.write(base64.b64decode(node.firstChild.nodeValue))
@@ -199,7 +206,7 @@ def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix):
     print('looking for all bugs having %s attachment(s)' % mimetype)
     process(query, True, get_file_bz_ids(files, prefix))
 
-def get_through_rss_query(queryurl, mimetype, prefix, suffix, startid):
+def get_through_rss_query(queryurl, mimetype, prefix, suffix):
     try:
         os.mkdir(suffix)
     except:
@@ -218,10 +225,7 @@ def get_through_rss_query(queryurl, mimetype, prefix, suffix, startid):
         entries = []
         for entry in d['entries']:
             bugid = entry['id'].split('=')[-1]
-            if (int(bugid) >= startid):
-                entries.append(entry)
-            else:
-                print("Dropping " + bugid + " because < startid of " + str(startid))
+            entries.append(entry)
 
         if full:
             available = set([str(entry['id'].split('=')[-1]) for entry in entries])
@@ -328,20 +332,20 @@ def get_launchpad_bugs(prefix):
                 f.close()
 
 rss_bugzillas = (
-    ( 'abi', 'http://bugzilla.abisource.com/buglist.cgi', 0 ), #added for abiword
-    ( 'fdo', 'http://bugs.libreoffice.org/buglist.cgi', 0 ),
-    ( 'gentoo', 'http://bugs.gentoo.org/buglist.cgi', 0 ),
-    ( 'gnome', 'http://bugzilla.gnome.org/buglist.cgi', 0 ), # added for gnumeric
-    ( 'kde', 'http://bugs.kde.org/buglist.cgi', 0 ), # added for koffice/calligra
-    ( 'mandriva', 'https://qa.mandriva.com/buglist.cgi', 0 ),
-    ( 'moz', 'https://bugzilla.mozilla.org/buglist.cgi', 0 ),
+    ( 'abi', 'http://bugzilla.abisource.com/buglist.cgi' ), #added for abiword
+    ( 'fdo', 'http://bugs.libreoffice.org/buglist.cgi' ),
+    ( 'gentoo', 'http://bugs.gentoo.org/buglist.cgi' ),
+    ( 'gnome', 'http://bugzilla.gnome.org/buglist.cgi' ), # added for gnumeric
+    ( 'kde', 'http://bugs.kde.org/buglist.cgi' ), # added for koffice/calligra
+    ( 'mandriva', 'https://qa.mandriva.com/buglist.cgi' ),
+    ( 'moz', 'https://bugzilla.mozilla.org/buglist.cgi' ),
     # It seems something has changed and it is no longer possible to
     # download any files from there.
     # NOTE: I am leaving it in the list, commented out, just so someone
     # does not add it back immediately .-)
     # 'novell': 'https://bugzilla.novell.com/buglist.cgi',
-    ( 'ooo', 'https://bz.apache.org/ooo/buglist.cgi', 0 ),
-    ( 'tdf', 'http://bugs.documentfoundation.org/buglist.cgi', 88776 ),
+    ( 'ooo', 'https://bz.apache.org/ooo/buglist.cgi' ),
+    ( 'tdf', 'http://bugs.documentfoundation.org/buglist.cgi' ),
 )
 
 redhatrpc = 'https://bugzilla.redhat.com/xmlrpc.cgi'
@@ -497,9 +501,9 @@ class manage_threads(threading.Thread):
                 # Get job from queue
                 # Use job parameters to call our query
                 # Then let the queue know we are done with this job
-                (uri, mimetype, prefix, extension, startid) = jobs.get(True,6)
+                (uri, mimetype, prefix, extension) = jobs.get(True,6)
                 try:
-                    get_through_rss_query(uri, mimetype, prefix, extension, startid)
+                    get_through_rss_query(uri, mimetype, prefix, extension)
                 finally:
                     jobs.task_done()
             except KeyboardInterrupt:
@@ -508,7 +512,7 @@ class manage_threads(threading.Thread):
                 break
 
 def generate_multi_threading():
-    for (prefix, uri, startid) in rss_bugzillas:
+    for (prefix, uri) in rss_bugzillas:
 
         # Initialize threads
         for i in range(max_threads):
@@ -522,7 +526,7 @@ def generate_multi_threading():
             if mimetype == 'text/html' and prefix == 'moz':
                     continue
 
-            jobs.put([uri, mimetype, prefix, extension, startid], block=True)
+            jobs.put([uri, mimetype, prefix, extension], block=True)
             print("successfully placed a job in the queue searching for " + mimetype + " in bugtracker " + prefix)
 
         # Continue when all mimetypes are done for a bugzilla
commit 0cfe2c8c893bfe6d1c2dce5941065eb4e841e7cc
Author: Michael Stahl <mstahl at redhat.com>
Date:   Wed Jun 3 12:14:31 2015 +0200

    get-bugzilla-attachments: fix the multi-threading
    
    The queue was limited to an arbitrary maximum size, causing half of the
    jobs to be dropped on the floor.  Also it didn't run on Python 3.
    
    Change-Id: I90bfba448291d901c5a7c83389d17c6acdd919c8

diff --git a/bin/get-bugzilla-attachments-by-mimetype b/bin/get-bugzilla-attachments-by-mimetype
index 9ae182c..7e6dc83e 100755
--- a/bin/get-bugzilla-attachments-by-mimetype
+++ b/bin/get-bugzilla-attachments-by-mimetype
@@ -31,7 +31,11 @@ import re
 import os, os.path
 import stat
 import sys
-import threading, Queue
+import threading
+try:
+    import queue
+except:
+    import Queue as queue
 try:
     from urllib.request import urlopen
 except:
@@ -206,7 +210,7 @@ def get_through_rss_query(queryurl, mimetype, prefix, suffix, startid):
     get_bug_function = get_novell_bug_via_xml if prefix == "novell" else get_from_bug_url_via_xml
 
     def process(query, full, have=[]):
-        url = queryurl + '?' + '&'.join(['='.join(kv) for kv in query.iteritems()])
+        url = queryurl + '?' + '&'.join(['='.join(kv) for kv in query.items()])
         print('url is ' + url)
         d = feedparser.parse(url)
         print(str(len(d['entries'])) + ' bugs to process')
@@ -493,44 +497,40 @@ class manage_threads(threading.Thread):
                 # Get job from queue
                 # Use job parameters to call our query
                 # Then let the queue know we are done with this job
-                job = jobs.get(True,6)
-                get_through_rss_query(job[0], job[1], job[2], job[3], job[4]) # [0] = uri; [1] = mimetype; [2] = prefix; [3] = extension; [4] = startid
-                jobs.task_done()
+                (uri, mimetype, prefix, extension, startid) = jobs.get(True,6)
+                try:
+                    get_through_rss_query(uri, mimetype, prefix, extension, startid)
+                finally:
+                    jobs.task_done()
             except KeyboardInterrupt:
                 raise # Ctrl+C should work
-            except:
+            except queue.Empty:
                 break
 
 def generate_multi_threading():
     for (prefix, uri, startid) in rss_bugzillas:
 
         # Initialize threads
-        for i in xrange(max_threads):
+        for i in range(max_threads):
             manage_threads().start()
 
         # Create a job for every mimetype for a bugzilla
         for (mimetype,extension) in mimetypes.items():
-
-
             # It seems that bugzilla has problems returing that many results
             # (10000 results is probably a limit set somewhere) so we always
             # end processing the complete list.
             if mimetype == 'text/html' and prefix == 'moz':
                     continue
 
-            try:
-                jobs.put([uri, mimetype, prefix, extension, startid], block=True, timeout=3)
-                print("successfully placed a job in the queue searching for " + mimetype + " in bugtracker " + prefix)
-            except KeyboardInterrupt:
-                raise # Ctrl+C should work
-            except:
-                print("Queue full")
+            jobs.put([uri, mimetype, prefix, extension, startid], block=True)
+            print("successfully placed a job in the queue searching for " + mimetype + " in bugtracker " + prefix)
 
         # Continue when all mimetypes are done for a bugzilla
         jobs.join()
+        print("DONE with bugtracker " + prefix)
 
 max_threads = 20 # Number of threads to create, (1 = without multi-threading)
-jobs = Queue.Queue(40)
+jobs = queue.Queue()
 
 generate_multi_threading()