[PATCH] charsets: simplify check to see if conversion to charset possible

Wed Aug 16 14:28:51 UTC 2017

On Tue, Aug 15, 2017 at 11:36 PM, Dan Williams <dcbw at redhat.com> wrote:
> On Tue, 2017-08-15 at 23:25 +0200, Aleksander Morgado wrote:
>> Instead of having a method that returns the expected length after the
>> conversion and the amount of input UTF-8 characters that couldn't be
>> converted to the given charset, simplify the logic and just define a
>> method that returns a boolean specifying whether the conversion is
>> possible or not.
>> ---
>>
>> Hey,
>>
>> How about this one? It really is asking for unit tests, truth be
>> told. Will do those next.
>
> LGTM
>

I've pushed this commit with unit tests included, see:
https://cgit.freedesktop.org/ModemManager/ModemManager/commit/?id=5a01e5406660bc4f5a3cf68254510e867f89a457

>> ---
>>  src/mm-charsets.c      | 66 ++++++++++++++++++--------------------
>> ------------
>>  src/mm-charsets.h      |  7 +++---
>>  src/mm-sms-part-3gpp.c |  6 +----
>>  src/mm-sms-part-cdma.c |  6 +----
>>  4 files changed, 29 insertions(+), 56 deletions(-)
>>
>> diff --git a/src/mm-charsets.c b/src/mm-charsets.c
>> index fb47b0ae..108e3ab6 100644
>> --- a/src/mm-charsets.c
>> +++ b/src/mm-charsets.c
>> @@ -463,43 +463,37 @@ mm_charset_utf8_to_unpacked_gsm (const char
>> *utf8, guint32 *out_len)
>>  }
>>
>>  static gboolean
>> -gsm_is_subset (gunichar c, const char *utf8, gsize ulen, guint
>> *out_clen)
>> +gsm_is_subset (gunichar c, const char *utf8, gsize ulen)
>>  {
>>      guint8 gsm;
>>
>> -    *out_clen = 1;
>>      if (utf8_to_gsm_def_char (utf8, ulen, &gsm))
>>          return TRUE;
>> -    if (utf8_to_gsm_ext_char (utf8, ulen, &gsm)) {
>> -        *out_clen = 2;
>> +    if (utf8_to_gsm_ext_char (utf8, ulen, &gsm))
>>          return TRUE;
>> -    }
>>      return FALSE;
>>  }
>>
>>  static gboolean
>> -ira_is_subset (gunichar c, const char *utf8, gsize ulen, guint
>> *out_clen)
>> +ira_is_subset (gunichar c, const char *utf8, gsize ulen)
>>  {
>> -    *out_clen = 1;
>>      return (ulen == 1);
>>  }
>>
>>  static gboolean
>> -ucs2_is_subset (gunichar c, const char *utf8, gsize ulen, guint
>> *out_clen)
>> +ucs2_is_subset (gunichar c, const char *utf8, gsize ulen)
>>  {
>> -    *out_clen = 2;
>>      return (c <= 0xFFFF);
>>  }
>>
>>  static gboolean
>> -iso88591_is_subset (gunichar c, const char *utf8, gsize ulen, guint
>> *out_clen)
>> +iso88591_is_subset (gunichar c, const char *utf8, gsize ulen)
>>  {
>> -    *out_clen = 1;
>>      return (c <= 0xFF);
>>  }
>>
>>  static gboolean
>> -pccp437_is_subset (gunichar c, const char *utf8, gsize ulen, guint
>> *out_clen)
>> +pccp437_is_subset (gunichar c, const char *utf8, gsize ulen)
>>  {
>>      static const gunichar t[] = {
>>          0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5,
>> 0x00e7, 0x00ea,
>> @@ -520,8 +514,6 @@ pccp437_is_subset (gunichar c, const char *utf8,
>> gsize ulen, guint *out_clen)
>>      };
>>      int i;
>>
>> -    *out_clen = 1;
>> -
>>      if (c <= 0x7F)
>>          return TRUE;
>>      for (i = 0; i < sizeof (t) / sizeof (t[0]); i++) {
>> @@ -532,7 +524,7 @@ pccp437_is_subset (gunichar c, const char *utf8,
>> gsize ulen, guint *out_clen)
>>  }
>>
>>  static gboolean
>> -pcdn_is_subset (gunichar c, const char *utf8, gsize ulen, guint
>> *out_clen)
>> +pcdn_is_subset (gunichar c, const char *utf8, gsize ulen)
>>  {
>>      static const gunichar t[] = {
>>          0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5,
>> 0x00e7, 0x00ea,
>> @@ -553,8 +545,6 @@ pcdn_is_subset (gunichar c, const char *utf8,
>> gsize ulen, guint *out_clen)
>>      };
>>      int i;
>>
>> -    *out_clen = 1;
>> -
>>      if (c <= 0x7F)
>>          return TRUE;
>>      for (i = 0; i < sizeof (t) / sizeof (t[0]); i++) {
>> @@ -566,7 +556,7 @@ pcdn_is_subset (gunichar c, const char *utf8,
>> gsize ulen, guint *out_clen)
>>
>>  typedef struct {
>>      MMModemCharset cs;
>> -    gboolean (*func) (gunichar c, const char *utf8, gsize ulen,
>> guint *out_clen);
>> +    gboolean (*func) (gunichar c, const char *utf8, gsize ulen);
>>      guint charsize;
>>  } SubsetEntry;
>>
>> @@ -581,40 +571,34 @@ SubsetEntry subset_table[] = {
>>  };
>>
>>  /**
>> - * mm_charset_get_encoded_len:
>> + * mm_charset_can_covert_to:
>> + * @utf8: UTF-8 valid string.
>> + * @charset: the #MMModemCharset to validate the conversion from
>> @utf8.
>>   *
>> - * @utf8: UTF-8 valid string
>> - * @charset: the #MMModemCharset to check the length of @utf8 in
>> - * @out_unsupported: on return, number of characters of @utf8 that
>> are not fully
>> - * representable in @charset
>> - *
>> - * Returns: the size in bytes of the string if converted from UTF-8
>> into @charset.
>> - **/
>> -guint
>> -mm_charset_get_encoded_len (const char *utf8,
>> -                            MMModemCharset charset,
>> -                            guint *out_unsupported)
>> + * Returns: %TRUE if the conversion is possible without errors,
>> %FALSE otherwise.
>> + */
>> +gboolean
>> +mm_charset_can_convert_to (const char *utf8,
>> +                           MMModemCharset charset)
>>  {
>>      const char *p = utf8;
>> -    guint len = 0, unsupported = 0;
>>      SubsetEntry *e;
>>
>> -    g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, 0);
>> -    g_return_val_if_fail (utf8 != NULL, 0);
>> +    g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN,
>> FALSE);
>> +    g_return_val_if_fail (utf8 != NULL, FALSE);
>>
>>      if (charset == MM_MODEM_CHARSET_UTF8)
>> -        return strlen (utf8);
>> +        return TRUE;
>>
>>      /* Find the charset in our subset table */
>>      for (e = &subset_table[0];
>>           e->cs != charset && e->cs != MM_MODEM_CHARSET_UNKNOWN;
>>           e++);
>> -    g_return_val_if_fail (e->cs != MM_MODEM_CHARSET_UNKNOWN, 0);
>> +    g_return_val_if_fail (e->cs != MM_MODEM_CHARSET_UNKNOWN, FALSE);
>>
>>      while (*p) {
>>          gunichar c;
>>          const char *end;
>> -        guint clen = 0;
>>
>>          c = g_utf8_get_char_validated (p, -1);
>>          g_return_val_if_fail (c != (gunichar) -1, 0);
>> @@ -625,15 +609,13 @@ mm_charset_get_encoded_len (const char *utf8,
>>              while (*++end);
>>          }
>>
>> -        if (!e->func (c, p, (end - p), &clen))
>> -            unsupported++;
>> -        len += clen;
>> +        if (!e->func (c, p, (end - p)))
>> +            return FALSE;
>> +
>>          p = end;
>>      }
>>
>> -    if (out_unsupported)
>> -        *out_unsupported = unsupported;
>> -    return len;
>> +    return TRUE;
>>  }
>>
>>  guint8 *
>> diff --git a/src/mm-charsets.h b/src/mm-charsets.h
>> index a85f3dc7..f738fa34 100644
>> --- a/src/mm-charsets.h
>> +++ b/src/mm-charsets.h
>> @@ -57,10 +57,9 @@ guint8 *mm_charset_utf8_to_unpacked_gsm (const
>> char *utf8, guint32 *out_len);
>>
>>  guint8 *mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm, guint32
>> len);
>>
>> -/* Returns the size in bytes required to hold the UTF-8 string in
>> the given charset */
>> -guint mm_charset_get_encoded_len (const char *utf8,
>> -                                  MMModemCharset charset,
>> -                                  guint *out_unsupported);
>> +/* Checks whether conversion to the given charset may be done
>> without errors */
>> +gboolean mm_charset_can_convert_to (const char *utf8,
>> +                                    MMModemCharset charset);
>>
>>  guint8 *gsm_unpack (const guint8 *gsm,
>>                      guint32 num_septets,
>> diff --git a/src/mm-sms-part-3gpp.c b/src/mm-sms-part-3gpp.c
>> index 3f4d6c06..c413a4d0 100644
>> --- a/src/mm-sms-part-3gpp.c
>> +++ b/src/mm-sms-part-3gpp.c
>> @@ -1026,7 +1026,6 @@ gchar **
>>  mm_sms_part_3gpp_util_split_text (const gchar *text,
>>                                    MMSmsEncoding *encoding)
>>  {
>> -    guint gsm_unsupported = 0;
>>      gchar **out;
>>      guint n_chunks;
>>      guint i;
>> @@ -1058,10 +1057,7 @@ mm_sms_part_3gpp_util_split_text (const gchar
>> *text,
>>       */
>>
>>      /* Check if we can do GSM encoding */
>> -    mm_charset_get_encoded_len (text,
>> -                                MM_MODEM_CHARSET_GSM,
>> -                                &gsm_unsupported);
>> -    if (gsm_unsupported > 0) {
>> +    if (!mm_charset_can_convert_to (text, MM_MODEM_CHARSET_GSM)) {
>>          /* If cannot do it in GSM encoding, do it in UCS-2 */
>>          GByteArray *array;
>>
>> diff --git a/src/mm-sms-part-cdma.c b/src/mm-sms-part-cdma.c
>> index 8d76bcec..167eda83 100644
>> --- a/src/mm-sms-part-cdma.c
>> +++ b/src/mm-sms-part-cdma.c
>> @@ -1365,7 +1365,6 @@ decide_best_encoding (const gchar *text,
>>                        guint *num_bits_per_field,
>>                        Encoding *encoding)
>>  {
>> -    guint latin_unsupported = 0;
>>      guint ascii_unsupported = 0;
>>      guint i;
>>      guint len;
>> @@ -1391,10 +1390,7 @@ decide_best_encoding (const gchar *text,
>>      }
>>
>>      /* Check if we can do Latin encoding */
>> -    mm_charset_get_encoded_len (text,
>> -                                MM_MODEM_CHARSET_8859_1,
>> -                                &latin_unsupported);
>> -    if (!latin_unsupported) {
>> +    if (mm_charset_can_convert_to (text, MM_MODEM_CHARSET_8859_1)) {
>>          *out = g_byte_array_sized_new (len);
>>          mm_modem_charset_byte_array_append (*out,
>>                                              text,
>> --
>> 2.13.1


-- 
Aleksander
https://aleksander.es