[PATCH] charsets: simplify check to see if conversion to charset possible
Dan Williams
dcbw at redhat.com
Tue Aug 15 21:36:43 UTC 2017
On Tue, 2017-08-15 at 23:25 +0200, Aleksander Morgado wrote:
> Instead of having a method that returns the expected length after the
> conversion and the amount of input UTF-8 characters that couldn't be
> converted to the given charset, simplify the logic and just define a
> method that returns a boolean specifying whether the conversion is
> possible or not.
> ---
>
> Hey,
>
> How about this one? It really is asking for unit tests, truth be
> told. Will do those next.
LGTM
> ---
> src/mm-charsets.c | 66 ++++++++++++++++++--------------------
> ------------
> src/mm-charsets.h | 7 +++---
> src/mm-sms-part-3gpp.c | 6 +----
> src/mm-sms-part-cdma.c | 6 +----
> 4 files changed, 29 insertions(+), 56 deletions(-)
>
> diff --git a/src/mm-charsets.c b/src/mm-charsets.c
> index fb47b0ae..108e3ab6 100644
> --- a/src/mm-charsets.c
> +++ b/src/mm-charsets.c
> @@ -463,43 +463,37 @@ mm_charset_utf8_to_unpacked_gsm (const char
> *utf8, guint32 *out_len)
> }
>
> static gboolean
> -gsm_is_subset (gunichar c, const char *utf8, gsize ulen, guint
> *out_clen)
> +gsm_is_subset (gunichar c, const char *utf8, gsize ulen)
> {
> guint8 gsm;
>
> - *out_clen = 1;
> if (utf8_to_gsm_def_char (utf8, ulen, &gsm))
> return TRUE;
> - if (utf8_to_gsm_ext_char (utf8, ulen, &gsm)) {
> - *out_clen = 2;
> + if (utf8_to_gsm_ext_char (utf8, ulen, &gsm))
> return TRUE;
> - }
> return FALSE;
> }
>
> static gboolean
> -ira_is_subset (gunichar c, const char *utf8, gsize ulen, guint
> *out_clen)
> +ira_is_subset (gunichar c, const char *utf8, gsize ulen)
> {
> - *out_clen = 1;
> return (ulen == 1);
> }
>
> static gboolean
> -ucs2_is_subset (gunichar c, const char *utf8, gsize ulen, guint
> *out_clen)
> +ucs2_is_subset (gunichar c, const char *utf8, gsize ulen)
> {
> - *out_clen = 2;
> return (c <= 0xFFFF);
> }
>
> static gboolean
> -iso88591_is_subset (gunichar c, const char *utf8, gsize ulen, guint
> *out_clen)
> +iso88591_is_subset (gunichar c, const char *utf8, gsize ulen)
> {
> - *out_clen = 1;
> return (c <= 0xFF);
> }
>
> static gboolean
> -pccp437_is_subset (gunichar c, const char *utf8, gsize ulen, guint
> *out_clen)
> +pccp437_is_subset (gunichar c, const char *utf8, gsize ulen)
> {
> static const gunichar t[] = {
> 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5,
> 0x00e7, 0x00ea,
> @@ -520,8 +514,6 @@ pccp437_is_subset (gunichar c, const char *utf8,
> gsize ulen, guint *out_clen)
> };
> int i;
>
> - *out_clen = 1;
> -
> if (c <= 0x7F)
> return TRUE;
> for (i = 0; i < sizeof (t) / sizeof (t[0]); i++) {
> @@ -532,7 +524,7 @@ pccp437_is_subset (gunichar c, const char *utf8,
> gsize ulen, guint *out_clen)
> }
>
> static gboolean
> -pcdn_is_subset (gunichar c, const char *utf8, gsize ulen, guint
> *out_clen)
> +pcdn_is_subset (gunichar c, const char *utf8, gsize ulen)
> {
> static const gunichar t[] = {
> 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5,
> 0x00e7, 0x00ea,
> @@ -553,8 +545,6 @@ pcdn_is_subset (gunichar c, const char *utf8,
> gsize ulen, guint *out_clen)
> };
> int i;
>
> - *out_clen = 1;
> -
> if (c <= 0x7F)
> return TRUE;
> for (i = 0; i < sizeof (t) / sizeof (t[0]); i++) {
> @@ -566,7 +556,7 @@ pcdn_is_subset (gunichar c, const char *utf8,
> gsize ulen, guint *out_clen)
>
> typedef struct {
> MMModemCharset cs;
> - gboolean (*func) (gunichar c, const char *utf8, gsize ulen,
> guint *out_clen);
> + gboolean (*func) (gunichar c, const char *utf8, gsize ulen);
> guint charsize;
> } SubsetEntry;
>
> @@ -581,40 +571,34 @@ SubsetEntry subset_table[] = {
> };
>
> /**
> - * mm_charset_get_encoded_len:
> + * mm_charset_can_covert_to:
> + * @utf8: UTF-8 valid string.
> + * @charset: the #MMModemCharset to validate the conversion from
> @utf8.
> *
> - * @utf8: UTF-8 valid string
> - * @charset: the #MMModemCharset to check the length of @utf8 in
> - * @out_unsupported: on return, number of characters of @utf8 that
> are not fully
> - * representable in @charset
> - *
> - * Returns: the size in bytes of the string if converted from UTF-8
> into @charset.
> - **/
> -guint
> -mm_charset_get_encoded_len (const char *utf8,
> - MMModemCharset charset,
> - guint *out_unsupported)
> + * Returns: %TRUE if the conversion is possible without errors,
> %FALSE otherwise.
> + */
> +gboolean
> +mm_charset_can_convert_to (const char *utf8,
> + MMModemCharset charset)
> {
> const char *p = utf8;
> - guint len = 0, unsupported = 0;
> SubsetEntry *e;
>
> - g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, 0);
> - g_return_val_if_fail (utf8 != NULL, 0);
> + g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN,
> FALSE);
> + g_return_val_if_fail (utf8 != NULL, FALSE);
>
> if (charset == MM_MODEM_CHARSET_UTF8)
> - return strlen (utf8);
> + return TRUE;
>
> /* Find the charset in our subset table */
> for (e = &subset_table[0];
> e->cs != charset && e->cs != MM_MODEM_CHARSET_UNKNOWN;
> e++);
> - g_return_val_if_fail (e->cs != MM_MODEM_CHARSET_UNKNOWN, 0);
> + g_return_val_if_fail (e->cs != MM_MODEM_CHARSET_UNKNOWN, FALSE);
>
> while (*p) {
> gunichar c;
> const char *end;
> - guint clen = 0;
>
> c = g_utf8_get_char_validated (p, -1);
> g_return_val_if_fail (c != (gunichar) -1, 0);
> @@ -625,15 +609,13 @@ mm_charset_get_encoded_len (const char *utf8,
> while (*++end);
> }
>
> - if (!e->func (c, p, (end - p), &clen))
> - unsupported++;
> - len += clen;
> + if (!e->func (c, p, (end - p)))
> + return FALSE;
> +
> p = end;
> }
>
> - if (out_unsupported)
> - *out_unsupported = unsupported;
> - return len;
> + return TRUE;
> }
>
> guint8 *
> diff --git a/src/mm-charsets.h b/src/mm-charsets.h
> index a85f3dc7..f738fa34 100644
> --- a/src/mm-charsets.h
> +++ b/src/mm-charsets.h
> @@ -57,10 +57,9 @@ guint8 *mm_charset_utf8_to_unpacked_gsm (const
> char *utf8, guint32 *out_len);
>
> guint8 *mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm, guint32
> len);
>
> -/* Returns the size in bytes required to hold the UTF-8 string in
> the given charset */
> -guint mm_charset_get_encoded_len (const char *utf8,
> - MMModemCharset charset,
> - guint *out_unsupported);
> +/* Checks whether conversion to the given charset may be done
> without errors */
> +gboolean mm_charset_can_convert_to (const char *utf8,
> + MMModemCharset charset);
>
> guint8 *gsm_unpack (const guint8 *gsm,
> guint32 num_septets,
> diff --git a/src/mm-sms-part-3gpp.c b/src/mm-sms-part-3gpp.c
> index 3f4d6c06..c413a4d0 100644
> --- a/src/mm-sms-part-3gpp.c
> +++ b/src/mm-sms-part-3gpp.c
> @@ -1026,7 +1026,6 @@ gchar **
> mm_sms_part_3gpp_util_split_text (const gchar *text,
> MMSmsEncoding *encoding)
> {
> - guint gsm_unsupported = 0;
> gchar **out;
> guint n_chunks;
> guint i;
> @@ -1058,10 +1057,7 @@ mm_sms_part_3gpp_util_split_text (const gchar
> *text,
> */
>
> /* Check if we can do GSM encoding */
> - mm_charset_get_encoded_len (text,
> - MM_MODEM_CHARSET_GSM,
> - &gsm_unsupported);
> - if (gsm_unsupported > 0) {
> + if (!mm_charset_can_convert_to (text, MM_MODEM_CHARSET_GSM)) {
> /* If cannot do it in GSM encoding, do it in UCS-2 */
> GByteArray *array;
>
> diff --git a/src/mm-sms-part-cdma.c b/src/mm-sms-part-cdma.c
> index 8d76bcec..167eda83 100644
> --- a/src/mm-sms-part-cdma.c
> +++ b/src/mm-sms-part-cdma.c
> @@ -1365,7 +1365,6 @@ decide_best_encoding (const gchar *text,
> guint *num_bits_per_field,
> Encoding *encoding)
> {
> - guint latin_unsupported = 0;
> guint ascii_unsupported = 0;
> guint i;
> guint len;
> @@ -1391,10 +1390,7 @@ decide_best_encoding (const gchar *text,
> }
>
> /* Check if we can do Latin encoding */
> - mm_charset_get_encoded_len (text,
> - MM_MODEM_CHARSET_8859_1,
> - &latin_unsupported);
> - if (!latin_unsupported) {
> + if (mm_charset_can_convert_to (text, MM_MODEM_CHARSET_8859_1)) {
> *out = g_byte_array_sized_new (len);
> mm_modem_charset_byte_array_append (*out,
> text,
> --
> 2.13.1
More information about the ModemManager-devel
mailing list