[poppler] reconsideration about iconv() in cpp frontend

Mon Apr 16 20:33:02 UTC 2018

El dilluns, 16 d’abril de 2018, a les 19:11:01 CEST, Adam Reichold va 
escriure:
> Hello again,
> 
> I really think that poppler::ustring defined as
> 
> std::basic_string<unsigned short>
> 
> is already broken where sizeof(unsigned short) != 2 and hence should be
> redefined to
> 
> std::basic_string<std::uint16_t>
> 
> without considering that an API break.

Sure, is there any system where sizeof(unsigned short) != 2 anyway?

Cheers,
  Albert

> This then also allows to just use
> these buffers acquired using ustring::data to be used with the
> conversion functions from UTF.h.
> 
> Best regards, Adam.
> 
> Am 16.04.2018 um 18:44 schrieb suzuki toshiya:
> > Adam Reichold wrote:
> >>> A) use the functions in UTF.h, and change ustring class
> >>> to fit them.
> >>> 
> >>> B) rewrite the functions in UTF.h by template programming
> >>> and have the interfaces fitting to current ustring().
> >>> 
> >>> C) write diversions of the functions in UTF.h in cpp frontend.
> >> 
> >> I think what is best here is hard to decide without seeing actual code
> >> that at least starts in one of these directions.
> > 
> > Indeed. In fact, yesterday I was trying to write a sample
> > patch for B) direction, but I got stuck. Because current UTF.h
> > is written in C-style (*p++ = c style copy/concatenation).
> > Thus, if I try to pass std::basic_string or std::vector
> > object, I have to make yet another template (and different
> > implementations) for the concatenation.
> > 
> > Following is current my proposal, which still use reinterpret_cast<>(),
> > but if it is not good (e.g. unsigned short != uint16_t), it allocates
> > the buffer and free it internally. By replacing iconv() by UTF.h
> > functions, I keep from the cast between unsigned short (with unknown
> > byte order) array and char array.
> > 
> > Please let me explain...
> > 
> >> --- a/cpp/poppler-global.cpp
> >> +++ b/cpp/poppler-global.cpp
> >> @@ -26,6 +26,7 @@
> >> 
> >>  #include "poppler-private.h"
> >>  
> >>  #include "DateInfo.h"
> >> 
> >> +#include "UTF.h"
> >> 
> >>  #include <algorithm>
> > 
> > Include "UTF.h" to use UTF-8 <-> UTF-16 helpers in there.
> > 
> >> @@ -34,31 +35,8 @@
> >> 
> >>  #include <ios>
> >>  #include <iostream>
> >> 
> >> -#include <iconv.h>
> >> -
> >> 
> >>  #include "config.h"
> >> 
> >> -namespace
> >> -{
> >> -
> >> -struct MiniIconv
> >> -{
> >> -    MiniIconv(const char *to_code, const char *from_code)
> >> -        : i_(iconv_open(to_code, from_code))
> >> -    {}
> >> -    ~MiniIconv()
> >> -    { if (is_valid()) iconv_close(i_); }
> >> -    MiniIconv(const MiniIconv &) = delete;
> >> -    MiniIconv& operator=(const MiniIconv &) = delete;
> >> -    bool is_valid() const
> >> -    { return i_ != (iconv_t)-1; }
> >> -    operator iconv_t() const
> >> -    { return i_; }
> >> -    iconv_t i_;
> >> -};
> >> -
> >> -}
> >> -
> > 
> > Remove iconv() related functions.
> > 
> >> @@ -226,28 +204,31 @@ byte_array ustring::to_utf8() const
> >> 
> >>          return byte_array();
> >>      
> >>      }
> >> 
> >> -    MiniIconv ic("UTF-8", "UTF-16");
> >> -    if (!ic.is_valid()) {
> >> -        return byte_array();
> > 
> > Almost replacement of ustring::to_utf8(). See below.
> > 
> >> +#if 0xFFFFU == USHRT_MAX
> >> +    const uint16_t* utf16_buff = reinterpret_cast<const uint16_t
> >> *>(data()); +#else
> > 
> > above is the popular case that uint16_t == unsigned int.
> > simple casting is enough to get uint16_t array from
> > ustring object.
> > 
> >> +    const uint16_t* utf16_begin = new uint16_t[utf16_len];
> >> +    const uint16_t* utf16_buff = utf16_begin;
> >> +    const unsigned short *me = data();
> >> +
> >> +    for (int i = 0; i < size(); i++) {
> >> +        utf16_buff[i] = (uint16_t)me[i];
> >> 
> >>      }
> > 
> > [snip the removal of existing code]
> > 
> >> +#endif
> > 
> > above is the irregular case that uint16_t != unsigned int.
> > allocate temporal array of uint16_t and copy the contents.
> > for skipping of BOM in the beginning, the heaf of allocated
> > buffer is recorded in utf16_begin, but the UTF-16 data to
> > be converted starts from utf16_buff.
> > 
> >> +    const uint16_t* utf16_end = utf16_buff + size();
> >> +    if (utf16_buff[0] == 0xFEFF) {
> >> +        utf16_buff ++;
> >> 
> >>      }
> > 
> > in abovem utf16_buff is shifted if the beginning of the
> > buffer is BOM.
> > 
> >> -    str.resize(str.size() - str_len_left);
> >> -    return str;
> >> +
> >> +    const int utf8_len = utf16CountUtf8Bytes(utf16_buff);
> >> +    byte_array ret(utf8_len + 1, '\0');
> > 
> > count the required buffer size by utf16CountUtf8Bytes in UTF.h,
> > and allocate the byte_array to be returned. To assue the null
> > termination, the size is incremented and filled by '\0'.
> > 
> >> +    utf16ToUtf8(utf16_buff, reinterpret_cast<char *>(ret.data()),
> >> utf8_len + 1, utf16_end - utf16_buff); +   
> >> ret.resize(std::strlen(ret.data()));
> > 
> > convert by utf16ToUtf8(), and shrink to fit converted result.
> > 
> >> +
> >> +#if 0xFFFFU != USHRT_MAX
> >> +    delete utf16_begin;
> >> +#endif
> >> +    return ret;
> >> 
> >>  }
> > 
> > if irregular case, the temporal buffer should be freed.
> > 
> >> @@ -256,15 +237,29 @@ std::string ustring::to_latin1() const
> >> 
> >>          return std::string();
> >>      
> >>      }
> >> 
> >> -    const size_type mylength = size();
> >> -    std::string ret(mylength, '\0');
> >> 
> >>      const value_type *me = data();
> >> 
> >> -    for (size_type i = 0; i < mylength; ++i) {
> >> +    const value_type *me_end = me + size();
> >> +    if (*me == 0xFEFF) {
> >> +        me ++;
> >> +    }
> >> +    std::string ret(me_end - me, '\0');
> >> +    for (size_type i = 0; me < me_end; ++i) {
> >> 
> >>          ret[i] = (char)*me++;
> >>      
> >>      }
> >>      return ret;
> >>  
> >>  }
> > 
> > original implementation of ustring::to_latin() does not
> > care for BOM. now BOM is checked and skipped if there is.
> > 
> >> +static bool has_bom_utf8(const char *c, int len)
> >> +{
> >> +    if ( 3 > len ) {
> >> +        return false;
> >> +    }
> >> +    if (c[0] == 0xEF && c[1] == 0xBB && c[2] == 0xBF) {
> >> +        return true;
> >> +    }
> >> +    return false;
> >> +}
> >> +
> >> 
> >>  ustring ustring::from_utf8(const char *str, int len)
> >>  {
> >>  
> >>      if (len <= 0) {
> > 
> > even if ustring::from_utf8 receives UTF-8 string with BOM,
> > created ustring object should not include BOM. has_bom_utf8()
> > is an utility function for it.
> > 
> >> @@ -273,31 +268,27 @@ ustring ustring::from_utf8(const char *str, int
> >> len)
> >> 
> >>              return ustring();
> >>          
> >>          }
> >>      
> >>      }
> >> 
> >> -
> >> -    MiniIconv ic("UTF-16", "UTF-8");
> >> -    if (!ic.is_valid()) {
> >> -        return ustring();
> > 
> > remove iconv() related function.
> > 
> >> +    int utf16_count = utf8CountUtf16CodeUnits(str);
> >> +    if (has_bom_utf8(str, len)) {
> >> +        str += 3;       // skip BOM
> >> +        len -= 3;       // skip BOM
> >> +        utf16_count --; // skip BOM
> >> 
> >>      }
> > 
> > if UTF-8 starts with BOM, it is skiped, the length of
> > UTF-8 buffer is decremented (3 octets), the length of
> > UTF-16 buffer is decremented (16bit).
> > 
> >> +    ustring ret(utf16_count + 1, 0);
> > 
> > allocate ustring object to be returned. to assure the
> > U+0000l termination, the size is incremented.
> > 
> >> +#if 0xFFFFU == USHRT_MAX
> >> +    const uint16_t* utf16_buff = reinterpret_cast<const uint16_t
> >> *>(ret.data());> 
> > this is for popular case where uint16_t == unsigned short.
> > simple cast is enough to make the array of uint16_t.
> > 
> >> +#else
> >> +    const uint16_t* utf16_buff = new uint16_t[utf16_count + 1](0);
> >> +#endif
> > 
> > this is for irregular case. allocate temporal buffer to store
> > uint16_t data.
> > 
> >> +    utf8ToUtf16(str, (uint16_t*)utf16_buff, utf16_count + 1, len);
> > 
> > fill utf16_buff by converted UTF-16 data.
> > 
> >> +#if 0xFFFFU != USHRT_MAX
> >> +    for (int i = 0; i < utf16_count; i ++) {
> >> +        ret[i] = (unsigned short)utf16_buff[i];
> >> 
> >>      }
> > 
> > in irregular case, the content should be copied to the object
> > to be returned.
> > 
> >> +    delete utf16_buff;
> >> +#endif
> > 
> > the buffer for irregular case is freed here.
> > 
> >>      return ret;
> >>  
> >>  }
> >> 
> >> --- a/cpp/poppler-page.cpp
> >> +++ b/cpp/poppler-page.cpp
> >> @@ -355,7 +365,7 @@ std::vector<text_box> page::text_list() const
> >> 
> >>              TextWord *word = word_list->get(i);
> >>              
> >>              std::unique_ptr<GooString> gooWord{word->getText()};
> >> 
> >> -            ustring ustr =
> >> detail::unicode_GooString_to_ustring(gooWord.get()); +           
> >> ustring ustr = ustring::from_utf8(gooWord->getCString());> 
> > here, gooWord is the object created by TextOutputDev,
> > so it is always UTF-8. it should be passed to ustring::from_utf8().
> > 
> >> --- a/cpp/poppler-private.cpp
> >> +++ b/cpp/poppler-private.cpp
> >> @@ -23,6 +23,7 @@
> >> 
> >>  #include "poppler-private.h"
> >>  
> >>  #include "GooString.h"
> >> 
> >> +#include "PDFDocEncoding.h"
> >> 
> >>  #include "Page.h"
> >>  
> >>  #include <ctime>
> > 
> > the strings for PDF metadata is UTF-16BE with BOM or PDFDocEncoding.
> > to use PDFDocEncoding helper, include PDFDocEncoding.h
> > 
> >> @@ -58,16 +59,16 @@ rectf detail::pdfrectangle_to_rectf(const
> >> PDFRectangle &pdfrect)>> 
> >>  ustring detail::unicode_GooString_to_ustring(const GooString *str)
> >>  {
> >> 
> >> -    const char *data = str->getCString();
> >> 
> >>      const int len = str->getLength();
> >> 
> >> +    const char *data = str->getCString();
> >> +    const char *data_end = data + len;
> > 
> > input data may or may not have BOM, so the length of UTF-8 string
> > to be converted may or may not be len. to clarify the end of buffer,
> > data_end is calculated, instead of the index "i" in the existing
> > implementation.
> > 
> >> -    int i = 0;
> >> 
> >>      bool is_unicode = false;
> >>      if ((data[0] & 0xff) == 0xfe && (len > 1 && (data[1] & 0xff) ==
> >>      0xff)) {
> >>      
> >>          is_unicode = true;
> >> 
> >> -        i = 2;
> >> +        data += 2; // skip BOM
> >> 
> >>      }
> > 
> > instead of the index i, the beginning of UTF-16BE buffer is shifted
> > to skip BOM.
> > 
> >> -    ustring::size_type ret_len = len - i;
> >> +    ustring::size_type ret_len = data_end - data;
> >> 
> >>      if (is_unicode) {
> >>      
> >>          ret_len >>= 1;
> >>      
> >>      }
> > 
> > ditto.
> > 
> >> @@ -75,15 +76,15 @@ ustring detail::unicode_GooString_to_ustring(const
> >> GooString *str)>> 
> >>      size_t ret_index = 0;
> >>      ustring::value_type u;
> >>      if (is_unicode) {
> >> 
> >> -        while (i < len) {
> >> -            u = ((data[i] & 0xff) << 8) | (data[i + 1] & 0xff);
> >> -            i += 2;
> >> +        while (data < data_end) {
> >> +            u = ((*data & 0xff) << 8) | (*(data + 1) & 0xff);
> >> +            data += 2;
> >> 
> >>              ret[ret_index++] = u;
> >>          
> >>          }
> >>      
> >>      } else {
> >> 
> >> -        while (i < len) {
> >> -            u = data[i] & 0xff;
> >> -            ++i;
> >> +        while (data < data_end) {
> >> +            u = pdfDocEncoding[ *data & 0xff ];
> >> +            data ++;
> >> 
> >>              ret[ret_index++] = u;
> >>          
> >>          }
> >>      
> >>      }
> > 
> > use *data instead of data[i].