[Libreoffice-commits] core.git: Branch 'distro/collabora/cp-6.4' - include/vcl vcl/source

Tomaž Vajngerl (via logerrit) logerrit at kemper.freedesktop.org
Fri Nov 13 10:09:04 UTC 2020


 include/vcl/filter/pdfdocument.hxx     |  141 +++++
 vcl/source/filter/ipdf/pdfdocument.cxx |  834 +++++++++++++++++----------------
 2 files changed, 587 insertions(+), 388 deletions(-)

New commits:
commit 3dec064d663decbcd5768d2abf81e848df621808
Author:     Tomaž Vajngerl <tomaz.vajngerl at collabora.co.uk>
AuthorDate: Tue Nov 10 09:43:20 2020 +0100
Commit:     Andras Timar <andras.timar at collabora.com>
CommitDate: Fri Nov 13 11:08:19 2020 +0100

    pdf: move parsing into its own class, rewrite the parser
    
    This moves the parser into its own class as it is not only
    limited to dictionaries.
    
    The parser has been rewritten to handle the array elements
    correctly and properly. Previously, array elements were filled
    during the tokenization step, which is wrong, and the arrays
    weren't parsed recursively, making the parsing incomplete in some
    cases. This rewrite handles arrays similarly to dictionaries and
    thus allows them to be parsed properly.
    
    All the sub-classes of PDFElement have been moved into the
    header file.
    
    Another change is to no longer keep a separate input dictionary
    for the root object, and instead always look into the dictionary
    element, which should be available.
    
    + many other smaller changes
    
    Change-Id: I7fcf94760967bbd1474a0b432ba3a4e3c9b7cabe
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/105780
    Tested-by: Jenkins CollaboraOffice <jenkinscollaboraoffice at gmail.com>
    Reviewed-by: Andras Timar <andras.timar at collabora.com>

diff --git a/include/vcl/filter/pdfdocument.hxx b/include/vcl/filter/pdfdocument.hxx
index b1cf34856a95..bd88d56e674d 100644
--- a/include/vcl/filter/pdfdocument.hxx
+++ b/include/vcl/filter/pdfdocument.hxx
@@ -91,7 +91,6 @@ class VCL_DLLPUBLIC PDFObjectElement final : public PDFElement
     PDFDocument& m_rDoc;
     double m_fObjectValue;
     double m_fGenerationValue;
-    std::map<OString, PDFElement*> m_aDictionary;
     /// If set, the object contains this number element (outside any dictionary/array).
     PDFNumberElement* m_pNumberElement;
     /// Position after the '<<' token.
@@ -117,6 +116,8 @@ class VCL_DLLPUBLIC PDFObjectElement final : public PDFElement
     /// nested dictionaries.
     std::vector<PDFReferenceElement*> m_aDictionaryReferences;
 
+    bool m_bParsed;
+
     void parseIfNecessary();
 
 public:
@@ -145,7 +146,7 @@ public:
     sal_uInt64 GetArrayOffset() const;
     void SetArrayLength(sal_uInt64 nArrayLength);
     sal_uInt64 GetArrayLength() const;
-    PDFArrayElement* GetArray() const;
+    PDFArrayElement* GetArray();
     /// Parse objects stored in this object stream.
     void ParseStoredObjects();
     std::vector<std::unique_ptr<PDFElement>>& GetStoredElements();
@@ -168,6 +169,7 @@ public:
     bool Read(SvStream& rStream) override;
     void PushBack(PDFElement* pElement);
     const std::vector<PDFElement*>& GetElements() const;
+    PDFElement* GetElement(size_t nIndex) const { return m_aElements[nIndex]; }
 
     void writeString(OStringBuffer& rBuffer) override
     {
@@ -246,9 +248,10 @@ class VCL_DLLPUBLIC PDFNameElement final : public PDFElement
 public:
     PDFNameElement();
     bool Read(SvStream& rStream) override;
+    void SetValue(const OString& rValue) { m_aValue = rValue; }
     const OString& GetValue() const;
     sal_uInt64 GetLocation() const;
-    static sal_uInt64 GetLength() { return 0; }
+    sal_uInt64 GetLength() { return m_aValue.getLength(); }
 
     void writeString(OStringBuffer& rBuffer) override
     {
@@ -273,8 +276,6 @@ public:
     PDFDictionaryElement();
     bool Read(SvStream& rStream) override;
 
-    static size_t Parse(const std::vector<std::unique_ptr<PDFElement>>& rElements,
-                        PDFElement* pThis, std::map<OString, PDFElement*>& rDictionary);
     static PDFElement* Lookup(const std::map<OString, PDFElement*>& rDictionary,
                               const OString& rKey);
     void SetKeyOffset(const OString& rKey, sal_uInt64 nOffset);
@@ -286,6 +287,11 @@ public:
     PDFObjectElement* LookupObject(const OString& rDictionaryKey);
     /// Looks up an element which is contained in this dictionary.
     PDFElement* LookupElement(const OString& rDictionaryKey);
+    sal_uInt64 GetLocation() const { return m_nLocation; }
+    void insert(OString const& rKey, PDFElement* pPDFElement)
+    {
+        m_aItems.emplace(rKey, pPDFElement);
+    }
 
     void writeString(OStringBuffer& rBuffer) override
     {
@@ -402,12 +408,100 @@ public:
     PDFNumberElement();
     bool Read(SvStream& rStream) override;
     double GetValue() const;
+    void SetValue(double fValue) { m_fValue = fValue; }
+
     sal_uInt64 GetLocation() const;
     sal_uInt64 GetLength() const;
 
     void writeString(OStringBuffer& rBuffer) override { rBuffer.append(m_fValue); }
 };
 
+/// A one-liner comment.
+class VCL_DLLPUBLIC PDFCommentElement : public PDFElement
+{
+    PDFDocument& m_rDoc;
+    OString m_aComment;
+
+public:
+    explicit PDFCommentElement(PDFDocument& rDoc);
+    bool Read(SvStream& rStream) override;
+    void writeString(OStringBuffer& /*rBuffer*/) override {}
+};
+
+/// End of a dictionary: '>>'.
+class VCL_DLLPUBLIC PDFEndDictionaryElement : public PDFElement
+{
+    /// Offset before the '>>' token.
+    sal_uInt64 m_nLocation = 0;
+
+public:
+    PDFEndDictionaryElement();
+    bool Read(SvStream& rStream) override;
+    sal_uInt64 GetLocation() const;
+
+    void writeString(OStringBuffer& /*rBuffer*/) override {}
+};
+
+/// End of a stream: 'endstream' keyword.
+class VCL_DLLPUBLIC PDFEndStreamElement : public PDFElement
+{
+public:
+    bool Read(SvStream& rStream) override;
+
+    void writeString(OStringBuffer& /*rBuffer*/) override {}
+};
+
+/// End of an object: 'endobj' keyword.
+class VCL_DLLPUBLIC PDFEndObjectElement : public PDFElement
+{
+public:
+    bool Read(SvStream& rStream) override;
+
+    void writeString(OStringBuffer& /*rBuffer*/) override {}
+};
+
+/// End of an array: ']'.
+class VCL_DLLPUBLIC PDFEndArrayElement : public PDFElement
+{
+    /// Location before the ']' token.
+    sal_uInt64 m_nOffset = 0;
+
+public:
+    PDFEndArrayElement();
+    bool Read(SvStream& rStream) override;
+    sal_uInt64 GetOffset() const;
+
+    void writeString(OStringBuffer& /*rBuffer*/) override {}
+};
+
+/// Boolean object: a 'true' or a 'false'.
+class VCL_DLLPUBLIC PDFBooleanElement : public PDFElement
+{
+    bool m_aValue;
+
+public:
+    explicit PDFBooleanElement(bool bValue)
+        : m_aValue(bValue)
+    {
+    }
+
+    bool Read(SvStream& rStream) override;
+
+    void writeString(OStringBuffer& rBuffer) override
+    {
+        rBuffer.append(m_aValue ? "true" : "false");
+    }
+};
+
+/// Null object: the 'null' singleton.
+class VCL_DLLPUBLIC PDFNullElement : public PDFElement
+{
+public:
+    bool Read(SvStream& rStream) override;
+
+    void writeString(OStringBuffer& rBuffer) override { rBuffer.append("null"); }
+};
+
 /**
  * In-memory representation of an on-disk PDF document.
  *
@@ -523,6 +617,43 @@ public:
     bool writeBuffer(const void* pBuffer, sal_uInt64 nBytes) override;
 };
 
+/// The trailer singleton is at the end of the doc.
+class VCL_DLLPUBLIC PDFTrailerElement : public PDFElement
+{
+    PDFDocument& m_rDoc;
+    PDFDictionaryElement* m_pDictionaryElement;
+    /// Location of the end of the trailer token.
+    sal_uInt64 m_nOffset = 0;
+
+public:
+    explicit PDFTrailerElement(PDFDocument& rDoc);
+    bool Read(SvStream& rStream) override;
+    PDFElement* Lookup(const OString& rDictionaryKey);
+    sal_uInt64 GetLocation() const;
+
+    void SetDictionary(PDFDictionaryElement* pDictionaryElement)
+    {
+        m_pDictionaryElement = pDictionaryElement;
+    }
+
+    PDFDictionaryElement* GetDictionary() { return m_pDictionaryElement; }
+
+    void writeString(OStringBuffer& /*rBuffer*/) override { assert(false && "not implemented"); }
+};
+
+class VCL_DLLPUBLIC PDFObjectParser final
+{
+    const std::vector<std::unique_ptr<PDFElement>>& mrElements;
+
+public:
+    PDFObjectParser(std::vector<std::unique_ptr<PDFElement>> const& rElements)
+        : mrElements(rElements)
+    {
+    }
+
+    size_t parse(PDFElement* pParsingElement, size_t nStartIndex = 0, int nCurrentDepth = 0);
+};
+
 } // namespace pdfio
 } // namespace xmlsecurity
 
diff --git a/vcl/source/filter/ipdf/pdfdocument.cxx b/vcl/source/filter/ipdf/pdfdocument.cxx
index f83d1614364e..701b96e38606 100644
--- a/vcl/source/filter/ipdf/pdfdocument.cxx
+++ b/vcl/source/filter/ipdf/pdfdocument.cxx
@@ -38,112 +38,6 @@ namespace filter
 {
 const int MAX_SIGNATURE_CONTENT_LENGTH = 50000;
 
-class PDFTrailerElement;
-
-/// A one-liner comment.
-class PDFCommentElement : public PDFElement
-{
-    PDFDocument& m_rDoc;
-    OString m_aComment;
-
-public:
-    explicit PDFCommentElement(PDFDocument& rDoc);
-    bool Read(SvStream& rStream) override;
-    void writeString(OStringBuffer& /*rBuffer*/) override {}
-};
-
-class PDFReferenceElement;
-
-/// End of a dictionary: '>>'.
-class PDFEndDictionaryElement : public PDFElement
-{
-    /// Offset before the '>>' token.
-    sal_uInt64 m_nLocation = 0;
-
-public:
-    PDFEndDictionaryElement();
-    bool Read(SvStream& rStream) override;
-    sal_uInt64 GetLocation() const;
-
-    void writeString(OStringBuffer& /*rBuffer*/) override {}
-};
-
-/// End of a stream: 'endstream' keyword.
-class PDFEndStreamElement : public PDFElement
-{
-public:
-    bool Read(SvStream& rStream) override;
-
-    void writeString(OStringBuffer& /*rBuffer*/) override {}
-};
-
-/// End of an object: 'endobj' keyword.
-class PDFEndObjectElement : public PDFElement
-{
-public:
-    bool Read(SvStream& rStream) override;
-
-    void writeString(OStringBuffer& /*rBuffer*/) override {}
-};
-
-/// End of an array: ']'.
-class PDFEndArrayElement : public PDFElement
-{
-    /// Location before the ']' token.
-    sal_uInt64 m_nOffset = 0;
-
-public:
-    PDFEndArrayElement();
-    bool Read(SvStream& rStream) override;
-    sal_uInt64 GetOffset() const;
-
-    void writeString(OStringBuffer& /*rBuffer*/) override {}
-};
-
-/// Boolean object: a 'true' or a 'false'.
-class PDFBooleanElement : public PDFElement
-{
-    bool m_aValue;
-
-public:
-    explicit PDFBooleanElement(bool bValue)
-        : m_aValue(bValue)
-    {
-    }
-
-    bool Read(SvStream& rStream) override;
-
-    void writeString(OStringBuffer& rBuffer) override
-    {
-        rBuffer.append(m_aValue ? "true" : "false");
-    }
-};
-
-/// Null object: the 'null' singleton.
-class PDFNullElement : public PDFElement
-{
-public:
-    bool Read(SvStream& rStream) override;
-
-    void writeString(OStringBuffer& rBuffer) override { rBuffer.append("null"); }
-};
-
-/// The trailer singleton is at the end of the doc.
-class PDFTrailerElement : public PDFElement
-{
-    PDFDocument& m_rDoc;
-    std::map<OString, PDFElement*> m_aDictionary;
-    /// Location of the end of the trailer token.
-    sal_uInt64 m_nOffset = 0;
-
-public:
-    explicit PDFTrailerElement(PDFDocument& rDoc);
-    bool Read(SvStream& rStream) override;
-    PDFElement* Lookup(const OString& rDictionaryKey);
-    sal_uInt64 GetLocation() const;
-
-    void writeString(OStringBuffer& /*rBuffer*/) override {}
-};
 
 XRefEntry::XRefEntry() = default;
 
@@ -638,6 +532,7 @@ bool PDFDocument::WriteCatalogObject(sal_Int32 nAnnotId, PDFReferenceElement*& p
         sal_uInt64 nFieldsEndOffset = pAcroFormDictionary->GetKeyOffset("Fields")
                                       + pAcroFormDictionary->GetKeyValueLength("Fields")
                                       - strlen("]");
+
         // Length of beginning of the object dictionary -> Fields end.
         sal_uInt64 nFieldsBeforeEndLength = nFieldsEndOffset;
         if (pStreamBuffer)
@@ -1090,13 +985,12 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode,
     // The next number will be an xref offset.
     bool bInStartXRef = false;
     // Dictionary depth, so we know when we're outside any dictionaries.
-    int nDictionaryDepth = 0;
-    // Array depth, only the offset/length of the toplevel array is tracked.
-    int nArrayDepth = 0;
+    int nDepth = 0;
     // Last seen array token that's outside any dictionaries.
     PDFArrayElement* pArray = nullptr;
     // If we're inside an obj/endobj pair.
     bool bInObject = false;
+
     while (true)
     {
         char ch;
@@ -1133,7 +1027,7 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode,
                 if (ch == '<')
                 {
                     rElements.push_back(std::unique_ptr<PDFElement>(new PDFDictionaryElement()));
-                    ++nDictionaryDepth;
+                    ++nDepth;
                 }
                 else
                     rElements.push_back(std::unique_ptr<PDFElement>(new PDFHexStringElement));
@@ -1148,7 +1042,7 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode,
             case '>':
             {
                 rElements.push_back(std::unique_ptr<PDFElement>(new PDFEndDictionaryElement()));
-                --nDictionaryDepth;
+                --nDepth;
                 rStream.SeekRel(-1);
                 if (!rElements.back()->Read(rStream))
                 {
@@ -1162,7 +1056,7 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode,
             {
                 auto pArr = new PDFArrayElement(pObject);
                 rElements.push_back(std::unique_ptr<PDFElement>(pArr));
-                if (nDictionaryDepth == 0 && nArrayDepth == 0)
+                if (nDepth == 0)
                 {
                     // The array is attached directly, inform the object.
                     pArray = pArr;
@@ -1172,7 +1066,7 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode,
                         pObject->SetArrayOffset(rStream.Tell());
                     }
                 }
-                ++nArrayDepth;
+                ++nDepth;
                 rStream.SeekRel(-1);
                 if (!rElements.back()->Read(rStream))
                 {
@@ -1184,11 +1078,9 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode,
             case ']':
             {
                 rElements.push_back(std::unique_ptr<PDFElement>(new PDFEndArrayElement()));
-                --nArrayDepth;
-                if (nArrayDepth == 0)
-                    pArray = nullptr;
+                --nDepth;
                 rStream.SeekRel(-1);
-                if (nDictionaryDepth == 0 && nArrayDepth == 0)
+                if (nDepth == 0)
                 {
                     if (pObject)
                     {
@@ -1213,6 +1105,7 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode,
                     SAL_WARN("vcl.filter", "PDFDocument::Tokenize: PDFNameElement::Read() failed");
                     return false;
                 }
+
                 if (pObject && pObjectKey && pObjectKey->GetValue() == "Type"
                     && pNameElement->GetValue() == "ObjStm")
                     pObjectStream = pObject;
@@ -1256,7 +1149,7 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode,
                         if (it != m_aOffsetObjects.end())
                             m_pXRefStream = it->second;
                     }
-                    else if (bInObject && !nDictionaryDepth && !nArrayDepth && pObject)
+                    else if (bInObject && !nDepth && pObject)
                         // Number element inside an object, but outside a
                         // dictionary / array: remember it.
                         pObject->SetNumberElement(pNumberElement);
@@ -1303,10 +1196,7 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode,
                             auto pReference = new PDFReferenceElement(*this, *pObjectNumber,
                                                                       *pGenerationNumber);
                             rElements.push_back(std::unique_ptr<PDFElement>(pReference));
-                            if (pArray)
-                                // Reference is part of a direct (non-dictionary) array, inform the array.
-                                pArray->PushBack(rElements.back().get());
-                            if (bInObject && nDictionaryDepth > 0 && pObject)
+                            if (bInObject && nDepth > 0 && pObject)
                                 // Inform the object about a new in-dictionary reference.
                                 pObject->AddDictionaryReference(pReference);
                         }
@@ -2314,6 +2204,7 @@ const OString& PDFLiteralStringElement::GetValue() const { return m_aValue; }
 
 PDFTrailerElement::PDFTrailerElement(PDFDocument& rDoc)
     : m_rDoc(rDoc)
+    , m_pDictionaryElement(nullptr)
 {
 }
 
@@ -2325,10 +2216,14 @@ bool PDFTrailerElement::Read(SvStream& rStream)
 
 PDFElement* PDFTrailerElement::Lookup(const OString& rDictionaryKey)
 {
-    if (m_aDictionary.empty())
-        PDFDictionaryElement::Parse(m_rDoc.GetElements(), this, m_aDictionary);
-
-    return PDFDictionaryElement::Lookup(m_aDictionary, rDictionaryKey);
+    if (!m_pDictionaryElement)
+    {
+        PDFObjectParser aParser(m_rDoc.GetElements());
+        aParser.parse(this);
+    }
+    if (!m_pDictionaryElement)
+        return nullptr;
+    return m_pDictionaryElement->LookupElement(rDictionaryKey);
 }
 
 sal_uInt64 PDFTrailerElement::GetLocation() const { return m_nOffset; }
@@ -2347,6 +2242,7 @@ PDFObjectElement::PDFObjectElement(PDFDocument& rDoc, double fObjectValue, doubl
     , m_nArrayLength(0)
     , m_pArrayElement(nullptr)
     , m_pStreamElement(nullptr)
+    , m_bParsed(false)
 {
 }
 
@@ -2359,257 +2255,6 @@ bool PDFObjectElement::Read(SvStream& /*rStream*/)
 
 PDFDictionaryElement::PDFDictionaryElement() = default;
 
-size_t PDFDictionaryElement::Parse(const std::vector<std::unique_ptr<PDFElement>>& rElements,
-                                   PDFElement* pThis, std::map<OString, PDFElement*>& rDictionary)
-{
-    // The index of last parsed element, in case of nested dictionaries.
-    size_t nRet = 0;
-
-    if (!rDictionary.empty())
-        return nRet;
-
-    pThis->setParsing(true);
-
-    auto pThisObject = dynamic_cast<PDFObjectElement*>(pThis);
-    // This is set to non-nullptr here for nested dictionaries only.
-    auto pThisDictionary = dynamic_cast<PDFDictionaryElement*>(pThis);
-
-    // Find out where the dictionary for this object starts.
-    size_t nIndex = 0;
-    for (size_t i = 0; i < rElements.size(); ++i)
-    {
-        if (rElements[i].get() == pThis)
-        {
-            nIndex = i;
-            break;
-        }
-    }
-
-    OString aName;
-    sal_uInt64 nNameOffset = 0;
-    std::vector<PDFNumberElement*> aNumbers;
-    // The array value we're in -- if any.
-    PDFArrayElement* pArray = nullptr;
-    sal_uInt64 nDictionaryOffset = 0;
-    int nDictionaryDepth = 0;
-    // Toplevel dictionary found (not inside an array).
-    bool bDictionaryFound = false;
-    // Toplevel array found (not inside a dictionary).
-    bool bArrayFound = false;
-    for (size_t i = nIndex; i < rElements.size(); ++i)
-    {
-        // Dictionary tokens can be nested, track enter/leave.
-        if (auto pDictionary = dynamic_cast<PDFDictionaryElement*>(rElements[i].get()))
-        {
-            bDictionaryFound = true;
-            if (++nDictionaryDepth == 1)
-            {
-                // First dictionary start, track start offset.
-                nDictionaryOffset = pDictionary->m_nLocation;
-                if (pThisObject)
-                {
-                    if (!bArrayFound)
-                        // Then the toplevel dictionary of the object.
-                        pThisObject->SetDictionary(pDictionary);
-                    pThisDictionary = pDictionary;
-                    pThisObject->SetDictionaryOffset(nDictionaryOffset);
-                }
-            }
-            else if (!pDictionary->alreadyParsing())
-            {
-                // Nested dictionary.
-                const size_t nexti
-                    = PDFDictionaryElement::Parse(rElements, pDictionary, pDictionary->m_aItems);
-                if (nexti >= i) // ensure we go forwards and not endlessly loop
-                {
-                    i = nexti;
-                    if (pArray)
-                    {
-                        // Dictionary value inside an array.
-                        pArray->PushBack(pDictionary);
-                    }
-                    else
-                    {
-                        // Dictionary toplevel value.
-                        rDictionary[aName] = pDictionary;
-                        aName.clear();
-                    }
-                }
-            }
-        }
-
-        if (auto pEndDictionary = dynamic_cast<PDFEndDictionaryElement*>(rElements[i].get()))
-        {
-            if (--nDictionaryDepth == 0)
-            {
-                // Last dictionary end, track length and stop parsing.
-                if (pThisObject)
-                    pThisObject->SetDictionaryLength(pEndDictionary->GetLocation()
-                                                     - nDictionaryOffset);
-                nRet = i;
-                break;
-            }
-        }
-
-        auto pName = dynamic_cast<PDFNameElement*>(rElements[i].get());
-        if (pName)
-        {
-            if (!aNumbers.empty())
-            {
-                PDFNumberElement* pNumber = aNumbers.back();
-                rDictionary[aName] = pNumber;
-                if (pThisDictionary)
-                {
-                    pThisDictionary->SetKeyOffset(aName, nNameOffset);
-                    pThisDictionary->SetKeyValueLength(
-                        aName, pNumber->GetLocation() + pNumber->GetLength() - nNameOffset);
-                }
-                aName.clear();
-                aNumbers.clear();
-            }
-
-            if (aName.isEmpty())
-            {
-                // Remember key.
-                aName = pName->GetValue();
-                nNameOffset = pName->GetLocation();
-            }
-            else
-            {
-                if (pArray)
-                {
-                    if (bDictionaryFound)
-                        // Array inside dictionary.
-                        pArray->PushBack(pName);
-                }
-                else
-                {
-                    // Name-name key-value.
-                    rDictionary[aName] = pName;
-                    if (pThisDictionary)
-                    {
-                        pThisDictionary->SetKeyOffset(aName, nNameOffset);
-                        pThisDictionary->SetKeyValueLength(aName, pName->GetLocation()
-                                                                      + PDFNameElement::GetLength()
-                                                                      - nNameOffset);
-                    }
-                    aName.clear();
-                }
-            }
-            continue;
-        }
-
-        auto pArr = dynamic_cast<PDFArrayElement*>(rElements[i].get());
-        if (pArr)
-        {
-            bArrayFound = true;
-            pArray = pArr;
-            continue;
-        }
-
-        auto pEndArr = dynamic_cast<PDFEndArrayElement*>(rElements[i].get());
-        if (pArray && pEndArr)
-        {
-            for (auto& pNumber : aNumbers)
-                pArray->PushBack(pNumber);
-            aNumbers.clear();
-            rDictionary[aName] = pArray;
-            if (pThisDictionary)
-            {
-                pThisDictionary->SetKeyOffset(aName, nNameOffset);
-                // Include the ending ']' in the length of the key - (array)value pair length.
-                pThisDictionary->SetKeyValueLength(aName, pEndArr->GetOffset() - nNameOffset + 1);
-            }
-            aName.clear();
-            pArray = nullptr;
-            continue;
-        }
-
-        auto pReference = dynamic_cast<PDFReferenceElement*>(rElements[i].get());
-        if (pReference)
-        {
-            if (!pArray)
-            {
-                rDictionary[aName] = pReference;
-                if (pThisDictionary)
-                {
-                    pThisDictionary->SetKeyOffset(aName, nNameOffset);
-                    pThisDictionary->SetKeyValueLength(aName,
-                                                       pReference->GetOffset() - nNameOffset);
-                }
-                aName.clear();
-            }
-            else
-            {
-                if (bDictionaryFound)
-                    // Array inside dictionary.
-                    pArray->PushBack(pReference);
-            }
-            aNumbers.clear();
-            continue;
-        }
-
-        auto pLiteralString = dynamic_cast<PDFLiteralStringElement*>(rElements[i].get());
-        if (pLiteralString)
-        {
-            rDictionary[aName] = pLiteralString;
-            if (pThisDictionary)
-                pThisDictionary->SetKeyOffset(aName, nNameOffset);
-            aName.clear();
-            continue;
-        }
-
-        auto pBoolean = dynamic_cast<PDFBooleanElement*>(rElements[i].get());
-        if (pBoolean)
-        {
-            rDictionary[aName] = pBoolean;
-            if (pThisDictionary)
-                pThisDictionary->SetKeyOffset(aName, nNameOffset);
-            aName.clear();
-            continue;
-        }
-
-        auto pHexString = dynamic_cast<PDFHexStringElement*>(rElements[i].get());
-        if (pHexString)
-        {
-            if (!pArray)
-            {
-                rDictionary[aName] = pHexString;
-                if (pThisDictionary)
-                    pThisDictionary->SetKeyOffset(aName, nNameOffset);
-                aName.clear();
-            }
-            else
-            {
-                pArray->PushBack(pHexString);
-            }
-            continue;
-        }
-
-        if (dynamic_cast<PDFEndObjectElement*>(rElements[i].get()))
-            break;
-
-        // Just remember this, so that in case it's not a reference parameter,
-        // we can handle it later.
-        auto pNumber = dynamic_cast<PDFNumberElement*>(rElements[i].get());
-        if (pNumber)
-            aNumbers.push_back(pNumber);
-    }
-
-    if (!aNumbers.empty())
-    {
-        rDictionary[aName] = aNumbers.back();
-        if (pThisDictionary)
-            pThisDictionary->SetKeyOffset(aName, nNameOffset);
-        aName.clear();
-        aNumbers.clear();
-    }
-
-    pThis->setParsing(false);
-
-    return nRet;
-}
-
 PDFElement* PDFDictionaryElement::Lookup(const std::map<OString, PDFElement*>& rDictionary,
                                          const OString& rKey)
 {
@@ -2642,21 +2287,30 @@ PDFElement* PDFDictionaryElement::LookupElement(const OString& rDictionaryKey)
 
 void PDFObjectElement::parseIfNecessary()
 {
-    if (m_aDictionary.empty())
+    if (!m_bParsed)
     {
         if (!m_aElements.empty())
+        {
             // This is a stored object in an object stream.
-            PDFDictionaryElement::Parse(m_aElements, this, m_aDictionary);
+            PDFObjectParser aParser(m_aElements);
+            aParser.parse(this);
+        }
         else
+        {
             // Normal object: elements are stored as members of the document itself.
-            PDFDictionaryElement::Parse(m_rDoc.GetElements(), this, m_aDictionary);
+            PDFObjectParser aParser(m_rDoc.GetElements());
+            aParser.parse(this);
+        }
+        m_bParsed = true;
     }
 }
 
 PDFElement* PDFObjectElement::Lookup(const OString& rDictionaryKey)
 {
     parseIfNecessary();
-    return PDFDictionaryElement::Lookup(m_aDictionary, rDictionaryKey);
+    if (!m_pDictionaryElement)
+        return nullptr;
+    return PDFDictionaryElement::Lookup(GetDictionaryItems(), rDictionaryKey);
 }
 
 PDFObjectElement* PDFObjectElement::LookupObject(const OString& rDictionaryKey)
@@ -2765,7 +2419,7 @@ void PDFObjectElement::AddDictionaryReference(PDFReferenceElement* pReference)
 const std::map<OString, PDFElement*>& PDFObjectElement::GetDictionaryItems()
 {
     parseIfNecessary();
-    return m_aDictionary;
+    return m_pDictionaryElement->GetItems();
 }
 
 void PDFObjectElement::SetArray(PDFArrayElement* pArrayElement) { m_pArrayElement = pArrayElement; }
@@ -2777,7 +2431,11 @@ void PDFObjectElement::SetStream(PDFStreamElement* pStreamElement)
 
 PDFStreamElement* PDFObjectElement::GetStream() const { return m_pStreamElement; }
 
-PDFArrayElement* PDFObjectElement::GetArray() const { return m_pArrayElement; }
+PDFArrayElement* PDFObjectElement::GetArray()
+{
+    parseIfNecessary();
+    return m_pArrayElement;
+}
 
 void PDFObjectElement::ParseStoredObjects()
 {
@@ -3212,6 +2870,416 @@ bool PDFEndArrayElement::Read(SvStream& rStream)
 
 sal_uInt64 PDFEndArrayElement::GetOffset() const { return m_nOffset; }
 
+// PDFObjectParser: parse() scans mrElements from nStartIndex and fills pParsingElement; it
+// returns the index of the closing token that ended the scan, or 0 if none was reached.
+size_t PDFObjectParser::parse(PDFElement* pParsingElement, size_t nStartIndex, int nCurrentDepth)
+{
+    // Index of the closing token that ended this parse; stays 0 if none is found
+    size_t nReturnIndex = 0;
+    // nCurrentDepth is only forwarded (+1) to nested parse() calls; it is not checked here.
+    pParsingElement->setParsing(true);
+
+    comphelper::ScopeGuard aGuard([pParsingElement]() { pParsingElement->setParsing(false); });
+
+    // Current object or trailer, if root is one of those, else nullptr
+    auto pParsingObject = dynamic_cast<PDFObjectElement*>(pParsingElement);
+    auto pParsingTrailer = dynamic_cast<PDFTrailerElement*>(pParsingElement);
+
+    // Current dictionary, if root is a dictionary, else nullptr
+    auto pParsingDictionary = dynamic_cast<PDFDictionaryElement*>(pParsingElement);
+
+    // Current parsing array, if root is an array, else nullptr
+    auto pParsingArray = dynamic_cast<PDFArrayElement*>(pParsingElement);
+
+    // Find pParsingElement in the element list; keep nStartIndex if it is not found.
+    size_t nIndex = nStartIndex;
+    for (size_t i = nStartIndex; i < mrElements.size(); ++i)
+    {
+        if (mrElements[i].get() == pParsingElement)
+        {
+            nIndex = i;
+            break;
+        }
+    }
+
+    OString aName;
+    sal_uInt64 nNameOffset = 0;
+    std::vector<PDFNumberElement*> aNumbers;
+
+    sal_uInt64 nDictionaryOffset = 0;
+
+    // Nesting depth of dictionary/array start tokens seen so far; 1 is the outermost one
+    int nDepth = 0;
+
+    for (size_t i = nIndex; i < mrElements.size(); ++i)
+    {
+        auto* pCurrentElement = mrElements[i].get();
+
+        // Dictionary tokens can be nested, track enter/leave.
+        if (auto pCurrentDictionary = dynamic_cast<PDFDictionaryElement*>(pCurrentElement))
+        {
+            // Handle previously stored number
+            if (!aNumbers.empty())
+            {
+                if (pParsingDictionary)
+                {
+                    PDFNumberElement* pNumber = aNumbers.back();
+                    sal_uInt64 nLength
+                        = pNumber->GetLocation() + pNumber->GetLength() - nNameOffset;
+
+                    pParsingDictionary->insert(aName, pNumber);
+                    pParsingDictionary->SetKeyOffset(aName, nNameOffset);
+                    pParsingDictionary->SetKeyValueLength(aName, nLength);
+                }
+                else if (pParsingArray)
+                {
+                    for (auto& pNumber : aNumbers)
+                        pParsingArray->PushBack(pNumber);
+                }
+                else
+                {
+                    SAL_INFO("vcl.filter", "neither Dictionary nor Array available");
+                }
+                aName.clear();
+                aNumbers.clear();
+            }
+
+            nDepth++;
+
+            if (nDepth == 1) // pCurrentDictionary is the outermost dictionary
+            {
+                // First dictionary start, track start offset.
+                nDictionaryOffset = pCurrentDictionary->GetLocation();
+
+                if (pParsingObject)
+                {
+                    // Then the toplevel dictionary of the object.
+                    pParsingObject->SetDictionary(pCurrentDictionary);
+                    pParsingObject->SetDictionaryOffset(nDictionaryOffset);
+                    pParsingDictionary = pCurrentDictionary;
+                }
+                else if (pParsingTrailer)
+                {
+                    pParsingTrailer->SetDictionary(pCurrentDictionary);
+                    pParsingDictionary = pCurrentDictionary;
+                }
+            }
+            else if (!pCurrentDictionary->alreadyParsing())
+            {
+                if (pParsingArray)
+                {
+                    pParsingArray->PushBack(pCurrentDictionary);
+                }
+                else if (pParsingDictionary)
+                {
+                    // Dictionary toplevel value.
+                    pParsingDictionary->insert(aName, pCurrentDictionary);
+                }
+                else
+                {
+                    SAL_INFO("vcl.filter", "neither Dictionary nor Array available");
+                }
+                // Nested dictionary; a result of 0 means its end was not found, then keep i.
+                const size_t nNextElementIndex = parse(pCurrentDictionary, i, nCurrentDepth + 1);
+                i = std::max(i + 1, nNextElementIndex) - 1;
+            }
+        }
+        else if (auto pCurrentEndDictionary
+                 = dynamic_cast<PDFEndDictionaryElement*>(pCurrentElement))
+        {
+            // Handle previously stored number
+            if (!aNumbers.empty())
+            {
+                if (pParsingDictionary)
+                {
+                    PDFNumberElement* pNumber = aNumbers.back();
+                    sal_uInt64 nLength
+                        = pNumber->GetLocation() + pNumber->GetLength() - nNameOffset;
+
+                    pParsingDictionary->insert(aName, pNumber);
+                    pParsingDictionary->SetKeyOffset(aName, nNameOffset);
+                    pParsingDictionary->SetKeyValueLength(aName, nLength);
+                }
+                else if (pParsingArray)
+                {
+                    for (auto& pNumber : aNumbers)
+                        pParsingArray->PushBack(pNumber);
+                }
+                else
+                {
+                    SAL_INFO("vcl.filter", "neither Dictionary nor Array available");
+                }
+                aName.clear();
+                aNumbers.clear();
+            }
+
+            if (pParsingDictionary)
+            {
+                pParsingDictionary->SetKeyOffset(aName, nNameOffset);
+                sal_uInt64 nLength = pCurrentEndDictionary->GetLocation() - nNameOffset + 2;
+                pParsingDictionary->SetKeyValueLength(aName, nLength);
+                aName.clear();
+            }
+
+            if (nDepth == 1) // did the parsing end?
+            {
+                // Last dictionary end, track length and stop parsing.
+                if (pParsingObject)
+                {
+                    sal_uInt64 nDictionaryLength
+                        = pCurrentEndDictionary->GetLocation() - nDictionaryOffset;
+                    pParsingObject->SetDictionaryLength(nDictionaryLength);
+                }
+                nReturnIndex = i;
+                break;
+            }
+
+            nDepth--;
+        }
+        else if (auto pCurrentArray = dynamic_cast<PDFArrayElement*>(pCurrentElement))
+        {
+            // Handle previously stored number
+            if (!aNumbers.empty())
+            {
+                if (pParsingDictionary)
+                {
+                    PDFNumberElement* pNumber = aNumbers.back();
+
+                    sal_uInt64 nLength
+                        = pNumber->GetLocation() + pNumber->GetLength() - nNameOffset;
+                    pParsingDictionary->insert(aName, pNumber);
+                    pParsingDictionary->SetKeyOffset(aName, nNameOffset);
+                    pParsingDictionary->SetKeyValueLength(aName, nLength);
+                }
+                else if (pParsingArray)
+                {
+                    for (auto& pNumber : aNumbers)
+                        pParsingArray->PushBack(pNumber);
+                }
+                else
+                {
+                    SAL_INFO("vcl.filter", "neither Dictionary nor Array available");
+                }
+                aName.clear();
+                aNumbers.clear();
+            }
+
+            nDepth++;
+            if (nDepth == 1) // pCurrentArray is the outermost array
+            {
+                if (pParsingObject)
+                {
+                    pParsingObject->SetArray(pCurrentArray);
+                    pParsingArray = pCurrentArray;
+                }
+            }
+            else if (!pCurrentArray->alreadyParsing())
+            {
+                if (pParsingArray)
+                {
+                    // Array is toplevel
+                    pParsingArray->PushBack(pCurrentArray);
+                }
+                else if (pParsingDictionary)
+                {
+                    // Dictionary toplevel value.
+                    pParsingDictionary->insert(aName, pCurrentArray);
+                }
+
+                const size_t nNextElementIndex = parse(pCurrentArray, i, nCurrentDepth + 1);
+
+                // ensure we go forwards; a result of 0 (end not found) must not wrap around
+                i = std::max(i + 1, nNextElementIndex) - 1;
+            }
+        }
+        else if (auto pCurrentEndArray = dynamic_cast<PDFEndArrayElement*>(pCurrentElement))
+        {
+            // Handle previously stored number
+            if (!aNumbers.empty())
+            {
+                if (pParsingDictionary)
+                {
+                    PDFNumberElement* pNumber = aNumbers.back();
+
+                    sal_uInt64 nLength
+                        = pNumber->GetLocation() + pNumber->GetLength() - nNameOffset;
+                    pParsingDictionary->insert(aName, pNumber);
+                    pParsingDictionary->SetKeyOffset(aName, nNameOffset);
+                    pParsingDictionary->SetKeyValueLength(aName, nLength);
+                }
+                else if (pParsingArray)
+                {
+                    for (auto& pNumber : aNumbers)
+                        pParsingArray->PushBack(pNumber);
+                }
+                else
+                {
+                    SAL_INFO("vcl.filter", "neither Dictionary nor Array available");
+                }
+                aName.clear();
+                aNumbers.clear();
+            }
+
+            if (nDepth == 1) // did the parsing end?
+            {
+                // Last array end, remember its index and stop parsing.
+                nReturnIndex = i;
+                break;
+            }
+            else
+            {
+                if (pParsingDictionary)
+                {
+                    pParsingDictionary->SetKeyOffset(aName, nNameOffset);
+                    // Include the ending ']' in the length of the key - (array)value pair length.
+                    sal_uInt64 nLength = pCurrentEndArray->GetOffset() - nNameOffset + 1;
+                    pParsingDictionary->SetKeyValueLength(aName, nLength);
+                    aName.clear();
+                }
+            }
+            nDepth--;
+        }
+        else if (auto pCurrentName = dynamic_cast<PDFNameElement*>(pCurrentElement))
+        {
+            // Handle previously stored number
+            if (!aNumbers.empty())
+            {
+                if (pParsingDictionary)
+                {
+                    PDFNumberElement* pNumber = aNumbers.back();
+
+                    sal_uInt64 nLength
+                        = pNumber->GetLocation() + pNumber->GetLength() - nNameOffset;
+                    pParsingDictionary->insert(aName, pNumber);
+                    pParsingDictionary->SetKeyOffset(aName, nNameOffset);
+                    pParsingDictionary->SetKeyValueLength(aName, nLength);
+                }
+                else if (pParsingArray)
+                {
+                    for (auto& pNumber : aNumbers)
+                        pParsingArray->PushBack(pNumber);
+                }
+                aName.clear();
+                aNumbers.clear();
+            }
+
+            // Now handle name
+            if (pParsingArray)
+            {
+                // if we are in an array, just push the name to array
+                pParsingArray->PushBack(pCurrentName);
+            }
+            else if (pParsingDictionary)
+            {
+                // if we are in a dictionary, we need to store the name as a possible key
+                if (aName.isEmpty())
+                {
+                    aName = pCurrentName->GetValue();
+                    nNameOffset = pCurrentName->GetLocation();
+                }
+                else
+                {
+                    sal_uInt64 nKeyLength
+                        = pCurrentName->GetLocation() + pCurrentName->GetLength() - nNameOffset;
+                    pParsingDictionary->insert(aName, pCurrentName);
+                    pParsingDictionary->SetKeyOffset(aName, nNameOffset);
+                    pParsingDictionary->SetKeyValueLength(aName, nKeyLength);
+                    aName.clear();
+                }
+            }
+        }
+        else if (auto pReference = dynamic_cast<PDFReferenceElement*>(pCurrentElement))
+        {
+            if (pParsingArray)
+            {
+                pParsingArray->PushBack(pReference);
+            }
+            else if (pParsingDictionary)
+            {
+                sal_uInt64 nLength = pReference->GetOffset() - nNameOffset;
+                pParsingDictionary->insert(aName, pReference);
+                pParsingDictionary->SetKeyOffset(aName, nNameOffset);
+                pParsingDictionary->SetKeyValueLength(aName, nLength);
+                aName.clear();
+            }
+            else
+            {
+                SAL_INFO("vcl.filter", "neither Dictionary nor Array available");
+            }
+            aNumbers.clear();
+        }
+        else if (auto pLiteralString = dynamic_cast<PDFLiteralStringElement*>(pCurrentElement))
+        {
+            if (pParsingArray)
+            {
+                pParsingArray->PushBack(pLiteralString);
+            }
+            else if (pParsingDictionary)
+            {
+                pParsingDictionary->insert(aName, pLiteralString);
+                pParsingDictionary->SetKeyOffset(aName, nNameOffset);
+                aName.clear();
+            }
+            else
+            {
+                SAL_INFO("vcl.filter", "neither Dictionary nor Array available");
+            }
+        }
+        else if (auto pBoolean = dynamic_cast<PDFBooleanElement*>(pCurrentElement))
+        {
+            if (pParsingArray)
+            {
+                pParsingArray->PushBack(pBoolean);
+            }
+            else if (pParsingDictionary)
+            {
+                pParsingDictionary->insert(aName, pBoolean);
+                pParsingDictionary->SetKeyOffset(aName, nNameOffset);
+                aName.clear();
+            }
+            else
+            {
+                SAL_INFO("vcl.filter", "neither Dictionary nor Array available");
+            }
+        }
+        else if (auto pHexString = dynamic_cast<PDFHexStringElement*>(pCurrentElement))
+        {
+            if (pParsingArray)
+            {
+                pParsingArray->PushBack(pHexString);
+            }
+            else if (pParsingDictionary)
+            {
+                pParsingDictionary->insert(aName, pHexString);
+                pParsingDictionary->SetKeyOffset(aName, nNameOffset);
+                aName.clear();
+            }
+        }
+        else if (auto pNumberElement = dynamic_cast<PDFNumberElement*>(pCurrentElement))
+        {
+            // Just remember this, so that in case it's not a reference parameter,
+            // we can handle it later.
+            aNumbers.push_back(pNumberElement);
+        }
+        else if (dynamic_cast<PDFEndObjectElement*>(pCurrentElement))
+        {
+            // parsing of the object is finished
+            break;
+        }
+        else if (dynamic_cast<PDFObjectElement*>(pCurrentElement)
+                 || dynamic_cast<PDFTrailerElement*>(pCurrentElement))
+        {
+            continue;
+        }
+        else
+        {
+            SAL_INFO("vcl.filter", "Unhandeled element while parsing.");
+        }
+    }
+
+    return nReturnIndex;
+}
+
 } // namespace filter
 } // namespace vcl
 


More information about the Libreoffice-commits mailing list