[PATCH] charsets: simplify check to see if conversion to charset possible
Aleksander Morgado
aleksander at aleksander.es
Tue Aug 15 21:25:46 UTC 2017
Instead of having a method that returns the expected length after the
conversion and the amount of input UTF-8 characters that couldn't be
converted to the given charset, simplify the logic and just define a
method that returns a boolean specifying whether the conversion is
possible or not.
---
Hey,
How about this one? It really is asking for unit tests, truth be told. Will do those next.
---
src/mm-charsets.c | 66 ++++++++++++++++++--------------------------------
src/mm-charsets.h | 7 +++---
src/mm-sms-part-3gpp.c | 6 +----
src/mm-sms-part-cdma.c | 6 +----
4 files changed, 29 insertions(+), 56 deletions(-)
diff --git a/src/mm-charsets.c b/src/mm-charsets.c
index fb47b0ae..108e3ab6 100644
--- a/src/mm-charsets.c
+++ b/src/mm-charsets.c
@@ -463,43 +463,37 @@ mm_charset_utf8_to_unpacked_gsm (const char *utf8, guint32 *out_len)
}
static gboolean
-gsm_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
+gsm_is_subset (gunichar c, const char *utf8, gsize ulen)
{
guint8 gsm;
- *out_clen = 1;
if (utf8_to_gsm_def_char (utf8, ulen, &gsm))
return TRUE;
- if (utf8_to_gsm_ext_char (utf8, ulen, &gsm)) {
- *out_clen = 2;
+ if (utf8_to_gsm_ext_char (utf8, ulen, &gsm))
return TRUE;
- }
return FALSE;
}
static gboolean
-ira_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
+ira_is_subset (gunichar c, const char *utf8, gsize ulen)
{
- *out_clen = 1;
return (ulen == 1);
}
static gboolean
-ucs2_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
+ucs2_is_subset (gunichar c, const char *utf8, gsize ulen)
{
- *out_clen = 2;
return (c <= 0xFFFF);
}
static gboolean
-iso88591_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
+iso88591_is_subset (gunichar c, const char *utf8, gsize ulen)
{
- *out_clen = 1;
return (c <= 0xFF);
}
static gboolean
-pccp437_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
+pccp437_is_subset (gunichar c, const char *utf8, gsize ulen)
{
static const gunichar t[] = {
0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea,
@@ -520,8 +514,6 @@ pccp437_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
};
int i;
- *out_clen = 1;
-
if (c <= 0x7F)
return TRUE;
for (i = 0; i < sizeof (t) / sizeof (t[0]); i++) {
@@ -532,7 +524,7 @@ pccp437_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
}
static gboolean
-pcdn_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
+pcdn_is_subset (gunichar c, const char *utf8, gsize ulen)
{
static const gunichar t[] = {
0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea,
@@ -553,8 +545,6 @@ pcdn_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
};
int i;
- *out_clen = 1;
-
if (c <= 0x7F)
return TRUE;
for (i = 0; i < sizeof (t) / sizeof (t[0]); i++) {
@@ -566,7 +556,7 @@ pcdn_is_subset (gunichar c, const char *utf8, gsize ulen, guint *out_clen)
typedef struct {
MMModemCharset cs;
- gboolean (*func) (gunichar c, const char *utf8, gsize ulen, guint *out_clen);
+ gboolean (*func) (gunichar c, const char *utf8, gsize ulen);
guint charsize;
} SubsetEntry;
@@ -581,40 +571,34 @@ SubsetEntry subset_table[] = {
};
/**
- * mm_charset_get_encoded_len:
+ * mm_charset_can_covert_to:
+ * @utf8: UTF-8 valid string.
+ * @charset: the #MMModemCharset to validate the conversion from @utf8.
*
- * @utf8: UTF-8 valid string
- * @charset: the #MMModemCharset to check the length of @utf8 in
- * @out_unsupported: on return, number of characters of @utf8 that are not fully
- * representable in @charset
- *
- * Returns: the size in bytes of the string if converted from UTF-8 into @charset.
- **/
-guint
-mm_charset_get_encoded_len (const char *utf8,
- MMModemCharset charset,
- guint *out_unsupported)
+ * Returns: %TRUE if the conversion is possible without errors, %FALSE otherwise.
+ */
+gboolean
+mm_charset_can_convert_to (const char *utf8,
+ MMModemCharset charset)
{
const char *p = utf8;
- guint len = 0, unsupported = 0;
SubsetEntry *e;
- g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, 0);
- g_return_val_if_fail (utf8 != NULL, 0);
+ g_return_val_if_fail (charset != MM_MODEM_CHARSET_UNKNOWN, FALSE);
+ g_return_val_if_fail (utf8 != NULL, FALSE);
if (charset == MM_MODEM_CHARSET_UTF8)
- return strlen (utf8);
+ return TRUE;
/* Find the charset in our subset table */
for (e = &subset_table[0];
e->cs != charset && e->cs != MM_MODEM_CHARSET_UNKNOWN;
e++);
- g_return_val_if_fail (e->cs != MM_MODEM_CHARSET_UNKNOWN, 0);
+ g_return_val_if_fail (e->cs != MM_MODEM_CHARSET_UNKNOWN, FALSE);
while (*p) {
gunichar c;
const char *end;
- guint clen = 0;
c = g_utf8_get_char_validated (p, -1);
g_return_val_if_fail (c != (gunichar) -1, 0);
@@ -625,15 +609,13 @@ mm_charset_get_encoded_len (const char *utf8,
while (*++end);
}
- if (!e->func (c, p, (end - p), &clen))
- unsupported++;
- len += clen;
+ if (!e->func (c, p, (end - p)))
+ return FALSE;
+
p = end;
}
- if (out_unsupported)
- *out_unsupported = unsupported;
- return len;
+ return TRUE;
}
guint8 *
diff --git a/src/mm-charsets.h b/src/mm-charsets.h
index a85f3dc7..f738fa34 100644
--- a/src/mm-charsets.h
+++ b/src/mm-charsets.h
@@ -57,10 +57,9 @@ guint8 *mm_charset_utf8_to_unpacked_gsm (const char *utf8, guint32 *out_len);
guint8 *mm_charset_gsm_unpacked_to_utf8 (const guint8 *gsm, guint32 len);
-/* Returns the size in bytes required to hold the UTF-8 string in the given charset */
-guint mm_charset_get_encoded_len (const char *utf8,
- MMModemCharset charset,
- guint *out_unsupported);
+/* Checks whether conversion to the given charset may be done without errors */
+gboolean mm_charset_can_convert_to (const char *utf8,
+ MMModemCharset charset);
guint8 *gsm_unpack (const guint8 *gsm,
guint32 num_septets,
diff --git a/src/mm-sms-part-3gpp.c b/src/mm-sms-part-3gpp.c
index 3f4d6c06..c413a4d0 100644
--- a/src/mm-sms-part-3gpp.c
+++ b/src/mm-sms-part-3gpp.c
@@ -1026,7 +1026,6 @@ gchar **
mm_sms_part_3gpp_util_split_text (const gchar *text,
MMSmsEncoding *encoding)
{
- guint gsm_unsupported = 0;
gchar **out;
guint n_chunks;
guint i;
@@ -1058,10 +1057,7 @@ mm_sms_part_3gpp_util_split_text (const gchar *text,
*/
/* Check if we can do GSM encoding */
- mm_charset_get_encoded_len (text,
- MM_MODEM_CHARSET_GSM,
- &gsm_unsupported);
- if (gsm_unsupported > 0) {
+ if (!mm_charset_can_convert_to (text, MM_MODEM_CHARSET_GSM)) {
/* If cannot do it in GSM encoding, do it in UCS-2 */
GByteArray *array;
diff --git a/src/mm-sms-part-cdma.c b/src/mm-sms-part-cdma.c
index 8d76bcec..167eda83 100644
--- a/src/mm-sms-part-cdma.c
+++ b/src/mm-sms-part-cdma.c
@@ -1365,7 +1365,6 @@ decide_best_encoding (const gchar *text,
guint *num_bits_per_field,
Encoding *encoding)
{
- guint latin_unsupported = 0;
guint ascii_unsupported = 0;
guint i;
guint len;
@@ -1391,10 +1390,7 @@ decide_best_encoding (const gchar *text,
}
/* Check if we can do Latin encoding */
- mm_charset_get_encoded_len (text,
- MM_MODEM_CHARSET_8859_1,
- &latin_unsupported);
- if (!latin_unsupported) {
+ if (mm_charset_can_convert_to (text, MM_MODEM_CHARSET_8859_1)) {
*out = g_byte_array_sized_new (len);
mm_modem_charset_byte_array_append (*out,
text,
--
2.13.1
More information about the ModemManager-devel
mailing list