[PATCH] charsets: simplify check to see if conversion to charset possible

Tue Aug 15 21:36:43 UTC 2017

On Tue, 2017-08-15 at 23:25 +0200, Aleksander Morgado wrote:
> Instead of having a method that returns the expected length after the
> conversion and the amount of input UTF-8 characters that couldn't be
> converted to the given charset, simplify the logic and just define a
> method that returns a boolean specifying whether the conversion is
> possible or not.
> ---
> 
> Hey,
> 
> How about this one? It really is asking for unit tests, truth be
> told. Will do those next.

LGTM

> ---
>  src/mm-charsets.c      | 66 ++++++++++++++++++--------------------
> ------------
>  src/mm-charsets.h      |  7 +++---
>  src/mm-sms-part-3gpp.c |  6 +----
>  src/mm-sms-part-cdma.c |  6 +----
>  4 files changed, 29 insertions(+), 56 deletions(-)
> 
> diff --git a/src/mm-charsets.c b/src/mm-charsets.c
> index fb47b0ae..108e3ab6 100644
> --- a/src/mm-charsets.c
> +++ b/src/mm-charsets.c
> @@ -463,43 +463,37 @@ mm_charset_utf8_to_unpacked_gsm (const char
> *utf8, guint32 *out_len)
>  }
> 
>  static gboolean
> -gsm_is_subset (gunichar c, const char *utf8, gsize ulen, guint
> *out_clen)
> +gsm_is_subset (gunichar c, const char *utf8, gsize ulen)
>  {
>      guint8 gsm;
> 
> -    *out_clen = 1;
>      if (utf8_to_gsm_def_char (utf8, ulen, &gsm))
>          return TRUE;
> -    if (utf8_to_gsm_ext_char (utf8, ulen, &gsm)) {
> -        *out_clen = 2;
> +    if (utf8_to_gsm_ext_char (utf8, ulen, &gsm))
>          return TRUE;
> -    }
>      return FALSE;
>  }
> 
>  static gboolean
> -ira_is_subset (gunichar c, const char *utf8, gsize ulen, guint
> *out_clen)
> +ira_is_subset (gunichar c, const char *utf8, gsize ulen)
>  {
> -    *out_clen = 1;
>      return (ulen == 1);
>  }
> 
>  static gboolean
> -ucs2_is_subset (gunichar c, const char *utf8, gsize ulen, guint
> *out_clen)
> +ucs2_is_subset (gunichar c, const char *utf8, gsize ulen)
>  {
> -    *out_clen = 2;
>      return (c <= 0xFFFF);
>  }
> 
>  static gboolean
> -iso88591_is_subset (gunichar c, const char *utf8, gsize ulen, guint
> *out_clen)
> +iso88591_is_subset (gunichar c, const char *utf8, gsize ulen)
>  {
> -    *out_clen = 1;
>      return (c <= 0xFF);
>  }
> 
>  static gboolean
> -pccp437_is_subset (gunichar c, const char *utf8, gsize ulen, guint
> *out_clen)
> +pccp437_is_subset (gunichar c, const char *utf8, gsize ulen)
>  {
>      static const gunichar t[] = {
>          0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5,
> 0x00e7, 0x00ea,
> @@ -520,8 +514,6 @@ pccp437_is_subset (gunichar c, const char *utf8,
> gsize ulen, guint *out_clen)
>      };
>      int i;
> 
> -    *out_clen = 1;
> -
>      if (c <= 0x7F)
>          return TRUE;
>      for (i = 0; i < sizeof (t) / sizeof (t[0]); i++) {
> @@ -532,7 +524,7 @@ pccp437_is_subset (gunichar c, const char *utf8,
> gsize ulen, guint *out_clen)
>  }
> 
>  static gboolean
> -pcdn_is_subset (gunichar c, const char *utf8, gsize ulen, guint
> *out_clen)
> +pcdn_is_subset (gunichar c, const char *utf8, gsize ulen)
>  {
>      static const gunichar t[] = {
>          0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5,
> 0x00e7, 0x00ea,
> @@ -553,8 +545,6 @@ pcdn_is_subset (gunichar c, const char *utf8,
> gsize ulen, guint *out_clen)
>      };
>      int i;
> 
> -    *out_clen = 1;
> -
>      if (c <= 0x7F)
>          return TRUE;
>      for (i = 0; i < sizeof (t) / sizeof (t[0]); i++) {
> @@ -566,7 +556,7 @@ pcdn_is_subset (gunichar c, const char *utf8,
> gsize ulen, guint *out_clen)
> 
>  typedef struct {
>      MMModemCharset cs;
> -    gboolean (*func) (gunichar c, const char *utf8, gsize ulen,
> guint *out_clen);
> +    gboolean (*func) (gunichar c, const char *utf8, gsize ulen);
>      guint charsize;
>  } SubsetEntry;
> 
> @@ -581,40 +571,34 @@ SubsetEntry subset_table[] = {
>  };
> 
>  /**
> - * mm_charset_get_encoded_len:
> + * mm_charset_can_covert_to:
> + * @utf8: UTF-8 valid string.
> + * @charset: the #MMModemCharset to validate the conversion from
> @utf8.
>   *
> - * @utf8: UTF-8 valid string
> - * @charset: the #MMModemCharset to check the length of @utf8 in
> - * @out_unsupported: on return, number of characters of @utf8 that
> are not fully
> - * representable in @charset
> - *
> - * Returns: the size in bytes of the string if converted from UTF-8
> into @charset.
> - **/
> -guint
> -mm_charset_get_encoded_len (const char *utf8,
> -                            MMModemCharset charset,
> -                            guint *out_unsupported)
> + * Returns: %TRUE if the conversion is possible without errors,
> %FALSE otherwise.
> + */
> +gboolean
> +mm_charset_can_convert_to (const char *utf8,
> +                           MMModemCharset charset)
>  {
>      const char *p = utf8;
> -    guint len = 0, unsupported = 0;
>      SubsetEntry *e;
> 
> -    g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, 0);
> -    g_return_val_if_fail (utf8 != NULL, 0);
> +    g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN,
> FALSE);
> +    g_return_val_if_fail (utf8 != NULL, FALSE);
> 
>      if (charset == MM_MODEM_CHARSET_UTF8)
> -        return strlen (utf8);
> +        return TRUE;
> 
>      /* Find the charset in our subset table */
>      for (e = &subset_table[0];
>           e->cs != charset && e->cs != MM_MODEM_CHARSET_UNKNOWN;
>           e++);
> -    g_return_val_if_fail (e->cs != MM_MODEM_CHARSET_UNKNOWN, 0);
> +    g_return_val_if_fail (e->cs != MM_MODEM_CHARSET_UNKNOWN, FALSE);
> 
>      while (*p) {
>          gunichar c;
>          const char *end;
> -        guint clen = 0;
> 
>          c = g_utf8_get_char_validated (p, -1);
>          g_return_val_if_fail (c != (gunichar) -1, 0);
> @@ -625,15 +609,13 @@ mm_charset_get_encoded_len (const char *utf8,
>              while (*++end);
>          }
> 
> -        if (!e->func (c, p, (end - p), &clen))
> -            unsupported++;
> -        len += clen;
> +        if (!e->func (c, p, (end - p)))
> +            return FALSE;
> +
>          p = end;
>      }
> 
> -    if (out_unsupported)
> -        *out_unsupported = unsupported;
> -    return len;
> +    return TRUE;
>  }
> 
>  guint8 *
> diff --git a/src/mm-charsets.h b/src/mm-charsets.h
> index a85f3dc7..f738fa34 100644
> --- a/src/mm-charsets.h
> +++ b/src/mm-charsets.h
> @@ -57,10 +57,9 @@ guint8 *mm_charset_utf8_to_unpacked_gsm (const
> char *utf8, guint32 *out_len);
> 
>  guint8 *mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm, guint32
> len);
> 
> -/* Returns the size in bytes required to hold the UTF-8 string in
> the given charset */
> -guint mm_charset_get_encoded_len (const char *utf8,
> -                                  MMModemCharset charset,
> -                                  guint *out_unsupported);
> +/* Checks whether conversion to the given charset may be done
> without errors */
> +gboolean mm_charset_can_convert_to (const char *utf8,
> +                                    MMModemCharset charset);
> 
>  guint8 *gsm_unpack (const guint8 *gsm,
>                      guint32 num_septets,
> diff --git a/src/mm-sms-part-3gpp.c b/src/mm-sms-part-3gpp.c
> index 3f4d6c06..c413a4d0 100644
> --- a/src/mm-sms-part-3gpp.c
> +++ b/src/mm-sms-part-3gpp.c
> @@ -1026,7 +1026,6 @@ gchar **
>  mm_sms_part_3gpp_util_split_text (const gchar *text,
>                                    MMSmsEncoding *encoding)
>  {
> -    guint gsm_unsupported = 0;
>      gchar **out;
>      guint n_chunks;
>      guint i;
> @@ -1058,10 +1057,7 @@ mm_sms_part_3gpp_util_split_text (const gchar
> *text,
>       */
> 
>      /* Check if we can do GSM encoding */
> -    mm_charset_get_encoded_len (text,
> -                                MM_MODEM_CHARSET_GSM,
> -                                &gsm_unsupported);
> -    if (gsm_unsupported > 0) {
> +    if (!mm_charset_can_convert_to (text, MM_MODEM_CHARSET_GSM)) {
>          /* If cannot do it in GSM encoding, do it in UCS-2 */
>          GByteArray *array;
> 
> diff --git a/src/mm-sms-part-cdma.c b/src/mm-sms-part-cdma.c
> index 8d76bcec..167eda83 100644
> --- a/src/mm-sms-part-cdma.c
> +++ b/src/mm-sms-part-cdma.c
> @@ -1365,7 +1365,6 @@ decide_best_encoding (const gchar *text,
>                        guint *num_bits_per_field,
>                        Encoding *encoding)
>  {
> -    guint latin_unsupported = 0;
>      guint ascii_unsupported = 0;
>      guint i;
>      guint len;
> @@ -1391,10 +1390,7 @@ decide_best_encoding (const gchar *text,
>      }
> 
>      /* Check if we can do Latin encoding */
> -    mm_charset_get_encoded_len (text,
> -                                MM_MODEM_CHARSET_8859_1,
> -                                &latin_unsupported);
> -    if (!latin_unsupported) {
> +    if (mm_charset_can_convert_to (text, MM_MODEM_CHARSET_8859_1)) {
>          *out = g_byte_array_sized_new (len);
>          mm_modem_charset_byte_array_append (*out,
>                                              text,
> --
> 2.13.1