[packagekit] Backend unicode improvements

Luke Macken lmacken at redhat.com
Sun Mar 16 15:37:14 PDT 2008


Hey guys,

So I've made an attempt to clean up the unicode mess in the yum backend.
Some issues that I noticed with the current code:
    
    _toUTF behavior is inconsistent.  If you give it a unicode string,
    it returns a unicode string.  If you give it a byte string, it
    returns a byte string.  I assume we want to always return a unicode
    object?
    
    We catch UnicodeDecodeErrors in multiple places.  These should never
    happen, yet they currently do.

Ideally, we should be following the 3 golden rules for handling unicode
in Python:

    - decode (str->unicode) early
    - unicode everywhere
    - encode (unicode->str) late

To comply with the first rule, we should probably be decoding our
bytestrings to unicode in each helper script.  Since yum2 is in the
works, it's probably not worth the effort to modify all of them, so my
patch simply decodes in a couple of places in the yumBackend.

There is still a unicode hack or two lying around due to yum/rpmdb not
handling unicode properly.  The comments suggest that these fixes will be going
upstream to yum, so hopefully those are/will be underway.

Attached is a patch that attempts to clean up a lot of the unicode mess in the
yum/yum2 backends.

I've tested it locally with the issues that were raised on this list
in the past, and haven't seen any regressions.  I'd be glad to commit
this patch, but would really appreciate some extra testing.  I'm
currently at PyCon, and my rawhide vm seems to be on the fritz, so I
haven't been able to test it with the yum2 backend, or with
gnome-packagekit.

Part of this patch pulls the stdout utf8 writer codec out from the yumBackend
into packagekit/backend.py, so it will probably affect other
backends as well.

Cheers!

luke
-------------- next part --------------
>From 3a46336800350f2143e7ef0ba55a54898014a43c Mon Sep 17 00:00:00 2001
From: Luke Macken <lmacken at redhat.com>
Date: Sun, 16 Mar 2008 16:31:06 -0400
Subject: [PATCH] Backend unicode handling improvements

- Set sys.stdout to the utf-8 codec writer in the backend.py instead of yumBackend.py
- Rename _toUTF to _to_unicode, to remove ambiguity
- Make _to_unicode do the Right Thing
- Don't try to catch UnicodeDecodeErrors.  These should never happen.

diff --git a/backends/yum/helpers/yumBackend.py b/backends/yum/helpers/yumBackend.py
index 3c3df77..cba7c5a 100644
--- a/backends/yum/helpers/yumBackend.py
+++ b/backends/yum/helpers/yumBackend.py
@@ -220,8 +220,6 @@ class PackageKitYumBackend(PackageKitBaseBackend):
               "glibc", "hal", "dbus", "xen")
 
     def __init__(self,args,lock=True):
-        import codecs
-        sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
         signal.signal(signal.SIGQUIT, sigquit)
         PackageKitBaseBackend.__init__(self,args)
         self.yumbase = PackageKitYumBase()
@@ -241,12 +239,8 @@ class PackageKitYumBackend(PackageKitBaseBackend):
         @param bytes: The size of the package, in bytes
         convert the description to UTF before sending
         '''
-        desc = self._toUTF(desc)
-        try:
-            PackageKitBaseBackend.description(self,id,license,group,desc,url,bytes)            
-        except UnicodeDecodeError,e:
-            desc = repr(desc)[1:-1]
-            PackageKitBaseBackend.description(self,id,license,group,desc,url,bytes)
+        desc = self._to_unicode(desc)
+        PackageKitBaseBackend.description(self,id,license,group,desc,url,bytes)
 
     def package(self,id,status,summary):
         '''
@@ -256,23 +250,14 @@ class PackageKitYumBackend(PackageKitBaseBackend):
         @param summary: The package Summary
         convert the summary to UTF before sending
         '''
-        summary = self._toUTF(summary)
-        try:
-            PackageKitBaseBackend.package(self,id,status,summary)
-        except UnicodeDecodeError,e:
-            summary = repr(summary)[1:-1]
-            PackageKitBaseBackend.package(self,id,status,summary)
-
-    def _toUTF( self, txt ):
-        rc=""
-        if isinstance(txt,types.UnicodeType):
-            return txt
-        else:
-            try:
-                rc = unicode( txt, 'utf-8' )
-            except UnicodeDecodeError, e:
-                rc = unicode( txt, 'iso-8859-1' )
-            return rc.encode('utf-8')
+        summary = self._to_unicode(summary)
+        PackageKitBaseBackend.package(self,id,status,summary)
+
+    def _to_unicode(self, txt, encoding='utf-8'):
+        if isinstance(txt, basestring):
+            if not isinstance(txt, unicode):
+                txt = unicode(txt, encoding)
+        return txt
 
     def doLock(self):
         ''' Lock Yum'''
diff --git a/backends/yum2/helpers/yumDBUSBackend.py b/backends/yum2/helpers/yumDBUSBackend.py
index c9642ec..a6aff58 100755
--- a/backends/yum2/helpers/yumDBUSBackend.py
+++ b/backends/yum2/helpers/yumDBUSBackend.py
@@ -264,7 +264,7 @@ class PackageKitYumBackend(PackageKitBaseBackend):
         convert the summary to UTF before sending
         '''
         id = self._pkg_to_id(pkg)
-        summary = self._toUTF(pkg.summary)
+        summary = self._to_unicode(pkg.summary)
         self.Package(status,id,summary)
 
     def _show_description(self,id,license,group,desc,url,bytes):
@@ -278,23 +278,18 @@ class PackageKitYumBackend(PackageKitBaseBackend):
         @param bytes: The size of the package, in bytes
         convert the description to UTF before sending
         '''
-        desc = self._toUTF(desc)
+        desc = self._to_unicode(desc)
         self.Description(id,license,group,desc,url,bytes)
 
 #
 # Utility methods for Signals
 #
 
-    def _toUTF( self, txt ):
-        rc=""
-        if isinstance(txt,types.UnicodeType):
-            return txt
-        else:
-            try:
-                rc = unicode( txt, 'utf-8' )
-            except UnicodeDecodeError, e:
-                rc = unicode( txt, 'iso-8859-1' )
-            return rc.encode('utf-8')
+    def _to_unicode(self, txt, encoding='utf-8'):
+        if isinstance(txt, basestring):
+            if not isinstance(txt, unicode):
+                txt = unicode(txt, encoding)
+        return txt
 
     def _pkg_to_id(self,pkg):
         pkgver = self._get_package_ver(pkg)
diff --git a/python/packagekit/backend.py b/python/packagekit/backend.py
index b27890e..dc0f835 100644
--- a/python/packagekit/backend.py
+++ b/python/packagekit/backend.py
@@ -22,8 +22,12 @@
 
 # imports
 import sys
+import codecs
 import traceback
-import types
+import locale
+
+sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
+
 from enums import *
 
 # Classes
-- 
1.5.4.3



More information about the PackageKit mailing list