[Libreoffice-commits] core.git: xmlsecurity/inc xmlsecurity/qa xmlsecurity/source

Miklos Vajna vmiklos at collabora.co.uk
Wed Nov 2 14:01:41 UTC 2016


 xmlsecurity/inc/pdfio/pdfdocument.hxx              |   50 ++
 xmlsecurity/qa/unit/pdfsigning/data/pdf16adobe.pdf |binary
 xmlsecurity/qa/unit/pdfsigning/pdfsigning.cxx      |   14 
 xmlsecurity/source/pdfio/pdfdocument.cxx           |  439 +++++++++++++++------
 4 files changed, 382 insertions(+), 121 deletions(-)

New commits:
commit b0d1a39e995871ef81cb58e8f1587a771fdd2deb
Author: Miklos Vajna <vmiklos at collabora.co.uk>
Date:   Wed Nov 2 11:10:35 2016 +0100

    xmlsecurity PDF verify: add support for object streams
    
    Adobe Acrobat uses object streams (PDF 1.6) when it signs a PDF exported
    from LO (PDF 1.4), with this we can verify that signature.
    
    If the PDF had at least one signature in LO, then the doc is not
    upgraded from PDF 1.4, so that was working already.
    
    Change-Id: I54b4447ca965a8ba1ffc69bde228ab6f0bda59ee

diff --git a/xmlsecurity/inc/pdfio/pdfdocument.hxx b/xmlsecurity/inc/pdfio/pdfdocument.hxx
index 95663e6c..37457c0 100644
--- a/xmlsecurity/inc/pdfio/pdfdocument.hxx
+++ b/xmlsecurity/inc/pdfio/pdfdocument.hxx
@@ -45,7 +45,40 @@ enum class TokenizeMode
     /// Till the first %%EOF token.
     EOF_TOKEN,
     /// Till the end of the current object.
-    END_OF_OBJECT
+    END_OF_OBJECT,
+    /// Same as END_OF_OBJECT, but for object streams (no endobj keyword).
+    STORED_OBJECT
+};
+
+/// The type column of an entry in a cross-reference stream.
+enum class XRefEntryType
+{
+    /// xref "n" or xref stream "1".
+    NOT_COMPRESSED,
+    /// xref stream "2.
+    COMPRESSED
+};
+
+/// An entry in a cross-reference stream.
+struct XRefEntry
+{
+    XRefEntryType m_eType;
+    /**
+     * Non-compressed: The byte offset of the object, starting from the
+     * beginning of the file.
+     * Compressed: The object number of the object stream in which this object is
+     * stored.
+     */
+    sal_uInt64 m_nOffset;
+    /**
+     * Non-compressed: The generation number of the object.
+     * Compressed: The index of this object within the object stream.
+     */
+    sal_uInt64 m_nGenerationNumber;
+    /// Are changed as part of an incremental update?.
+    bool m_bDirty;
+
+    XRefEntry();
 };
 
 /**
@@ -60,9 +93,7 @@ class XMLSECURITY_DLLPUBLIC PDFDocument
     /// This vector owns all elements.
     std::vector< std::unique_ptr<PDFElement> > m_aElements;
     /// Object ID <-> object offset map.
-    std::map<size_t, size_t> m_aXRef;
-    /// Object ID <-> "are changed as part of an incremental update?" map.
-    std::map<size_t, bool> m_aXRefDirty;
+    std::map<size_t, XRefEntry> m_aXRef;
     /// Object offset <-> Object pointer map.
     std::map<size_t, PDFObjectElement*> m_aOffsetObjects;
     /// Object ID <-> Object pointer map.
@@ -80,8 +111,6 @@ class XMLSECURITY_DLLPUBLIC PDFDocument
     static int AsHex(char ch);
     /// Decode a hex dump.
     static std::vector<unsigned char> DecodeHexString(PDFHexStringElement* pElement);
-    /// Tokenize elements from current offset.
-    bool Tokenize(SvStream& rStream, TokenizeMode eMode);
 
 public:
     PDFDocument();
@@ -99,7 +128,14 @@ public:
     std::vector<PDFObjectElement*> GetPages();
     /// Remember the end location of an EOF token.
     void PushBackEOF(size_t nOffset);
-    const std::map<size_t, PDFObjectElement*>& GetIDObjects() const;
+    /// Look up object based on object number, possibly by parsing object streams.
+    PDFObjectElement* LookupObject(size_t nObjectNumber);
+    /// Access to the input document, even after the inpust ream is gone.
+    SvMemoryStream& GetEditBuffer();
+    /// Tokenize elements from current offset.
+    bool Tokenize(SvStream& rStream, TokenizeMode eMode, std::vector< std::unique_ptr<PDFElement> >& rElements, PDFObjectElement* pObject);
+    /// Register an object (owned directly or indirectly by m_aElements) as a provder for a given ID.
+    void SetIDObject(size_t nID, PDFObjectElement* pObject);
 
     /// Read elements from the start of the stream till its end.
     bool Read(SvStream& rStream);
diff --git a/xmlsecurity/qa/unit/pdfsigning/data/pdf16adobe.pdf b/xmlsecurity/qa/unit/pdfsigning/data/pdf16adobe.pdf
new file mode 100644
index 0000000..ac1c5f3
Binary files /dev/null and b/xmlsecurity/qa/unit/pdfsigning/data/pdf16adobe.pdf differ
diff --git a/xmlsecurity/qa/unit/pdfsigning/pdfsigning.cxx b/xmlsecurity/qa/unit/pdfsigning/pdfsigning.cxx
index 469ded6..2f7ef57 100644
--- a/xmlsecurity/qa/unit/pdfsigning/pdfsigning.cxx
+++ b/xmlsecurity/qa/unit/pdfsigning/pdfsigning.cxx
@@ -57,6 +57,8 @@ public:
     void testPDFRemoveAll();
     /// Test a PDF 1.4 document, signed by Adobe.
     void testPDF14Adobe();
+    /// Test a PDF 1.6 document, signed by Adobe.
+    void testPDF16Adobe();
 
     CPPUNIT_TEST_SUITE(PDFSigningTest);
     CPPUNIT_TEST(testPDFAdd);
@@ -64,6 +66,7 @@ public:
     CPPUNIT_TEST(testPDFRemove);
     CPPUNIT_TEST(testPDFRemoveAll);
     CPPUNIT_TEST(testPDF14Adobe);
+    CPPUNIT_TEST(testPDF16Adobe);
     CPPUNIT_TEST_SUITE_END();
 };
 
@@ -254,6 +257,17 @@ void PDFSigningTest::testPDF14Adobe()
 #endif
 }
 
+void PDFSigningTest::testPDF16Adobe()
+{
+#ifndef _WIN32
+    // Contains a cross-reference stream, object streams and a compressed
+    // stream with a predictor. And a valid signature.
+    // Found signatures was 0, as parsing failed due to lack of support for
+    // these features.
+    verify(m_directories.getURLFromSrc(DATA_DIRECTORY) + "pdf16adobe.pdf", 1);
+#endif
+}
+
 CPPUNIT_TEST_SUITE_REGISTRATION(PDFSigningTest);
 
 CPPUNIT_PLUGIN_IMPLEMENT();
diff --git a/xmlsecurity/source/pdfio/pdfdocument.cxx b/xmlsecurity/source/pdfio/pdfdocument.cxx
index b690b5d..894247f 100644
--- a/xmlsecurity/source/pdfio/pdfdocument.cxx
+++ b/xmlsecurity/source/pdfio/pdfdocument.cxx
@@ -78,6 +78,7 @@ public:
 class PDFReferenceElement;
 class PDFDictionaryElement;
 class PDFArrayElement;
+class PDFStreamElement;
 
 /// Indirect object: something with a unique ID.
 class PDFObjectElement : public PDFElement
@@ -93,6 +94,12 @@ class PDFObjectElement : public PDFElement
     PDFDictionaryElement* m_pDictionaryElement;
     /// The contained direct array, if any.
     PDFArrayElement* m_pArrayElement;
+    /// The stream of this object, used when this is an object stream.
+    PDFStreamElement* m_pStreamElement;
+    /// Objects of an object stream.
+    std::vector< std::unique_ptr<PDFObjectElement> > m_aStoredElements;
+    /// Elements of an object in an object stream.
+    std::vector< std::unique_ptr<PDFElement> > m_aElements;
 
 public:
     PDFObjectElement(PDFDocument& rDoc, double fObjectValue, double fGenerationValue);
@@ -107,7 +114,11 @@ public:
     PDFDictionaryElement* GetDictionary() const;
     void SetDictionary(PDFDictionaryElement* pDictionaryElement);
     void SetArray(PDFArrayElement* pArrayElement);
+    void SetStream(PDFStreamElement* pStreamElement);
     PDFArrayElement* GetArray() const;
+    /// Parse objects stored in this object stream.
+    void ParseStoredObjects();
+    std::vector< std::unique_ptr<PDFElement> >& GetStoredElements();
 };
 
 /// Dictionary object: a set key-value pairs.
@@ -175,7 +186,7 @@ public:
     /// Assuming the reference points to a number object, return its value.
     double LookupNumber(SvStream& rStream) const;
     /// Lookup referenced object, without assuming anything about its contents.
-    PDFObjectElement* LookupObject() const;
+    PDFObjectElement* LookupObject();
     int GetObjectValue() const;
     int GetGenerationValue() const;
 };
@@ -275,6 +286,14 @@ public:
     PDFElement* Lookup(const OString& rDictionaryKey);
 };
 
+XRefEntry::XRefEntry()
+    : m_eType(XRefEntryType::NOT_COMPRESSED),
+      m_nOffset(0),
+      m_nGenerationNumber(0),
+      m_bDirty(false)
+{
+}
+
 PDFDocument::PDFDocument()
     : m_pTrailer(nullptr),
       m_pXRefStream(nullptr)
@@ -315,14 +334,15 @@ bool PDFDocument::Sign(const uno::Reference<security::XCertificate>& xCertificat
 
     // Write signature object.
     sal_Int32 nSignatureId = m_aXRef.size();
-    sal_uInt64 nSignatureOffset = m_aEditBuffer.Tell();
-    m_aXRef[nSignatureId] = nSignatureOffset;
-    m_aXRefDirty[nSignatureId] = true;
+    XRefEntry aSignatureEntry;
+    aSignatureEntry.m_nOffset = m_aEditBuffer.Tell();
+    aSignatureEntry.m_bDirty = true;
+    m_aXRef[nSignatureId] = aSignatureEntry;
     OStringBuffer aSigBuffer;
     aSigBuffer.append(nSignatureId);
     aSigBuffer.append(" 0 obj\n");
     aSigBuffer.append("<</Contents <");
-    sal_Int64 nSignatureContentOffset = nSignatureOffset + aSigBuffer.getLength();
+    sal_Int64 nSignatureContentOffset = aSignatureEntry.m_nOffset + aSigBuffer.getLength();
     // Reserve space for the PKCS#7 object.
     const int MAX_SIGNATURE_CONTENT_LENGTH = 50000;
     OStringBuffer aContentFiller(MAX_SIGNATURE_CONTENT_LENGTH);
@@ -337,7 +357,7 @@ bool PDFDocument::Sign(const uno::Reference<security::XCertificate>& xCertificat
     aSigBuffer.append(" ");
     aSigBuffer.append(nSignatureContentOffset + MAX_SIGNATURE_CONTENT_LENGTH + 1);
     aSigBuffer.append(" ");
-    sal_uInt64 nSignatureLastByteRangeOffset = nSignatureOffset + aSigBuffer.getLength();
+    sal_uInt64 nSignatureLastByteRangeOffset = aSignatureEntry.m_nOffset + aSigBuffer.getLength();
     // We don't know how many bytes we need for the last ByteRange value, this
     // should be enough.
     OStringBuffer aByteRangeFiller;
@@ -358,8 +378,10 @@ bool PDFDocument::Sign(const uno::Reference<security::XCertificate>& xCertificat
 
     // Write appearance object.
     sal_Int32 nAppearanceId = m_aXRef.size();
-    m_aXRef[nAppearanceId] = m_aEditBuffer.Tell();
-    m_aXRefDirty[nAppearanceId] = true;
+    XRefEntry aAppearanceEntry;
+    aAppearanceEntry.m_nOffset = m_aEditBuffer.Tell();
+    aAppearanceEntry.m_bDirty = true;
+    m_aXRef[nAppearanceId] = aAppearanceEntry;
     m_aEditBuffer.WriteUInt32AsString(nAppearanceId);
     m_aEditBuffer.WriteCharPtr(" 0 obj\n");
     m_aEditBuffer.WriteCharPtr("<</Type/XObject\n/Subtype/Form\n");
@@ -368,8 +390,10 @@ bool PDFDocument::Sign(const uno::Reference<security::XCertificate>& xCertificat
 
     // Write the Annot object, references nSignatureId and nAppearanceId.
     sal_Int32 nAnnotId = m_aXRef.size();
-    m_aXRef[nAnnotId] = m_aEditBuffer.Tell();
-    m_aXRefDirty[nAnnotId] = true;
+    XRefEntry aAnnotEntry;
+    aAnnotEntry.m_nOffset = m_aEditBuffer.Tell();
+    aAnnotEntry.m_bDirty = true;
+    m_aXRef[nAnnotId] = aAnnotEntry;
     m_aEditBuffer.WriteUInt32AsString(nAnnotId);
     m_aEditBuffer.WriteCharPtr(" 0 obj\n");
     m_aEditBuffer.WriteCharPtr("<</Type/Annot/Subtype/Widget/F 132\n");
@@ -406,8 +430,8 @@ bool PDFDocument::Sign(const uno::Reference<security::XCertificate>& xCertificat
         SAL_WARN("xmlsecurity.pdfio", "PDFDocument::Sign: invalid first page obj id");
         return false;
     }
-    m_aXRef[nFirstPageId] = m_aEditBuffer.Tell();
-    m_aXRefDirty[nFirstPageId] = true;
+    m_aXRef[nFirstPageId].m_nOffset = m_aEditBuffer.Tell();
+    m_aXRef[nFirstPageId].m_bDirty = true;
     m_aEditBuffer.WriteUInt32AsString(nFirstPageId);
     m_aEditBuffer.WriteCharPtr(" 0 obj\n");
     m_aEditBuffer.WriteCharPtr("<<");
@@ -459,8 +483,8 @@ bool PDFDocument::Sign(const uno::Reference<security::XCertificate>& xCertificat
         SAL_WARN("xmlsecurity.pdfio", "PDFDocument::Sign: invalid catalog obj id");
         return false;
     }
-    m_aXRef[nCatalogId] = m_aEditBuffer.Tell();
-    m_aXRefDirty[nCatalogId] = true;
+    m_aXRef[nCatalogId].m_nOffset = m_aEditBuffer.Tell();
+    m_aXRef[nCatalogId].m_bDirty = true;
     m_aEditBuffer.WriteUInt32AsString(nCatalogId);
     m_aEditBuffer.WriteCharPtr(" 0 obj\n");
     m_aEditBuffer.WriteCharPtr("<<");
@@ -510,8 +534,8 @@ bool PDFDocument::Sign(const uno::Reference<security::XCertificate>& xCertificat
     for (const auto& rXRef : m_aXRef)
     {
         size_t nObject = rXRef.first;
-        size_t nOffset = rXRef.second;
-        if (!m_aXRefDirty[nObject])
+        size_t nOffset = rXRef.second.m_nOffset;
+        if (!rXRef.second.m_bDirty)
             continue;
 
         m_aEditBuffer.WriteUInt32AsString(nObject);
@@ -632,13 +656,13 @@ bool PDFDocument::Write(SvStream& rStream)
     return rStream.good();
 }
 
-bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode)
+bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode, std::vector< std::unique_ptr<PDFElement> >& rElements, PDFObjectElement* pObjectElement)
 {
+    // Last seen object token.
+    PDFObjectElement* pObject = pObjectElement;
     bool bInXRef = false;
     // The next number will be an xref offset.
     bool bInStartXRef = false;
-    // Last seen object token.
-    PDFObjectElement* pObject = nullptr;
     // Dictionary depth, so we know when we're outside any dictionaries.
     int nDictionaryDepth = 0;
     // Last seen array token that's outside any dictionaries.
@@ -655,9 +679,9 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode)
         case '%':
         {
             auto pComment = new PDFCommentElement(*this);
-            m_aElements.push_back(std::unique_ptr<PDFElement>(pComment));
+            rElements.push_back(std::unique_ptr<PDFElement>(pComment));
             rStream.SeekRel(-1);
-            if (!m_aElements.back()->Read(rStream))
+            if (!rElements.back()->Read(rStream))
                 return false;
             if (eMode == TokenizeMode::EOF_TOKEN && !m_aEOFs.empty() && m_aEOFs.back() == rStream.Tell())
             {
@@ -673,28 +697,28 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode)
             rStream.SeekRel(-2);
             if (ch == '<')
             {
-                m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFDictionaryElement()));
+                rElements.push_back(std::unique_ptr<PDFElement>(new PDFDictionaryElement()));
                 ++nDictionaryDepth;
             }
             else
-                m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFHexStringElement()));
-            if (!m_aElements.back()->Read(rStream))
+                rElements.push_back(std::unique_ptr<PDFElement>(new PDFHexStringElement()));
+            if (!rElements.back()->Read(rStream))
                 return false;
             break;
         }
         case '>':
         {
-            m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFEndDictionaryElement()));
+            rElements.push_back(std::unique_ptr<PDFElement>(new PDFEndDictionaryElement()));
             --nDictionaryDepth;
             rStream.SeekRel(-1);
-            if (!m_aElements.back()->Read(rStream))
+            if (!rElements.back()->Read(rStream))
                 return false;
             break;
         }
         case '[':
         {
             auto pArr = new PDFArrayElement();
-            m_aElements.push_back(std::unique_ptr<PDFElement>(pArr));
+            rElements.push_back(std::unique_ptr<PDFElement>(pArr));
             if (nDictionaryDepth == 0)
             {
                 // The array is attached directly, inform the object.
@@ -703,32 +727,32 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode)
                     pObject->SetArray(pArray);
             }
             rStream.SeekRel(-1);
-            if (!m_aElements.back()->Read(rStream))
+            if (!rElements.back()->Read(rStream))
                 return false;
             break;
         }
         case ']':
         {
-            m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFEndArrayElement()));
+            rElements.push_back(std::unique_ptr<PDFElement>(new PDFEndArrayElement()));
             pArray = nullptr;
             rStream.SeekRel(-1);
-            if (!m_aElements.back()->Read(rStream))
+            if (!rElements.back()->Read(rStream))
                 return false;
             break;
         }
         case '/':
         {
-            m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFNameElement()));
+            rElements.push_back(std::unique_ptr<PDFElement>(new PDFNameElement()));
             rStream.SeekRel(-1);
-            if (!m_aElements.back()->Read(rStream))
+            if (!rElements.back()->Read(rStream))
                 return false;
             break;
         }
         case '(':
         {
-            m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFLiteralStringElement()));
+            rElements.push_back(std::unique_ptr<PDFElement>(new PDFLiteralStringElement()));
             rStream.SeekRel(-1);
-            if (!m_aElements.back()->Read(rStream))
+            if (!rElements.back()->Read(rStream))
                 return false;
             break;
         }
@@ -738,7 +762,7 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode)
             {
                 // Numbering object: an integer or a real.
                 PDFNumberElement* pNumberElement = new PDFNumberElement();
-                m_aElements.push_back(std::unique_ptr<PDFElement>(pNumberElement));
+                rElements.push_back(std::unique_ptr<PDFElement>(pNumberElement));
                 rStream.SeekRel(-1);
                 if (!pNumberElement->Read(rStream))
                     return false;
@@ -761,15 +785,15 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode)
                 bool bObj = aKeyword == "obj";
                 if (bObj || aKeyword == "R")
                 {
-                    size_t nElements = m_aElements.size();
+                    size_t nElements = rElements.size();
                     if (nElements < 2)
                     {
                         SAL_WARN("xmlsecurity.pdfio", "PDFDocument::Tokenize: expected at least two tokens before 'obj' or 'R' keyword");
                         return false;
                     }
 
-                    auto pObjectNumber = dynamic_cast<PDFNumberElement*>(m_aElements[nElements - 2].get());
-                    auto pGenerationNumber = dynamic_cast<PDFNumberElement*>(m_aElements[nElements - 1].get());
+                    auto pObjectNumber = dynamic_cast<PDFNumberElement*>(rElements[nElements - 2].get());
+                    auto pGenerationNumber = dynamic_cast<PDFNumberElement*>(rElements[nElements - 1].get());
                     if (!pObjectNumber || !pGenerationNumber)
                     {
                         SAL_WARN("xmlsecurity.pdfio", "PDFDocument::Tokenize: missing object or generation number before 'obj' or 'R' keyword");
@@ -779,34 +803,34 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode)
                     if (bObj)
                     {
                         pObject = new PDFObjectElement(*this, pObjectNumber->GetValue(), pGenerationNumber->GetValue());
-                        m_aElements.push_back(std::unique_ptr<PDFElement>(pObject));
+                        rElements.push_back(std::unique_ptr<PDFElement>(pObject));
                         m_aOffsetObjects[pObjectNumber->GetLocation()] = pObject;
                         m_aIDObjects[pObjectNumber->GetValue()] = pObject;
                     }
                     else
                     {
-                        m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFReferenceElement(*this, pObjectNumber->GetValue(), pGenerationNumber->GetValue())));
+                        rElements.push_back(std::unique_ptr<PDFElement>(new PDFReferenceElement(*this, pObjectNumber->GetValue(), pGenerationNumber->GetValue())));
                         if (pArray)
                             // Reference is part of a direct (non-dictionary) array, inform the array.
-                            pArray->PushBack(m_aElements.back().get());
+                            pArray->PushBack(rElements.back().get());
                     }
-                    if (!m_aElements.back()->Read(rStream))
+                    if (!rElements.back()->Read(rStream))
                         return false;
                 }
                 else if (aKeyword == "stream")
                 {
                     // Look up the length of the stream from the parent object's dictionary.
                     size_t nLength = 0;
-                    for (size_t nElement = 0; nElement < m_aElements.size(); ++nElement)
+                    for (size_t nElement = 0; nElement < rElements.size(); ++nElement)
                     {
                         // Iterate in reverse order.
-                        size_t nIndex = m_aElements.size() - nElement - 1;
-                        PDFElement* pElement = m_aElements[nIndex].get();
-                        auto pObjectElement = dynamic_cast<PDFObjectElement*>(pElement);
-                        if (!pObjectElement)
+                        size_t nIndex = rElements.size() - nElement - 1;
+                        PDFElement* pElement = rElements[nIndex].get();
+                        auto pObj = dynamic_cast<PDFObjectElement*>(pElement);
+                        if (!pObj)
                             continue;
 
-                        PDFElement* pLookup = pObjectElement->Lookup("Length");
+                        PDFElement* pLookup = pObj->Lookup("Length");
                         auto pReference = dynamic_cast<PDFReferenceElement*>(pLookup);
                         if (pReference)
                         {
@@ -828,20 +852,23 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode)
                     }
 
                     PDFDocument::SkipLineBreaks(rStream);
-                    m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFStreamElement(nLength)));
-                    if (!m_aElements.back()->Read(rStream))
+                    auto pStreamElement = new PDFStreamElement(nLength);
+                    if (pObject)
+                        pObject->SetStream(pStreamElement);
+                    rElements.push_back(std::unique_ptr<PDFElement>(pStreamElement));
+                    if (!rElements.back()->Read(rStream))
                         return false;
                 }
                 else if (aKeyword == "endstream")
                 {
-                    m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFEndStreamElement()));
-                    if (!m_aElements.back()->Read(rStream))
+                    rElements.push_back(std::unique_ptr<PDFElement>(new PDFEndStreamElement()));
+                    if (!rElements.back()->Read(rStream))
                         return false;
                 }
                 else if (aKeyword == "endobj")
                 {
-                    m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFEndObjectElement()));
-                    if (!m_aElements.back()->Read(rStream))
+                    rElements.push_back(std::unique_ptr<PDFElement>(new PDFEndObjectElement()));
+                    if (!rElements.back()->Read(rStream))
                         return false;
                     if (eMode == TokenizeMode::END_OF_OBJECT)
                     {
@@ -850,9 +877,9 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode)
                     }
                 }
                 else if (aKeyword == "true" || aKeyword == "false")
-                    m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFBooleanElement(aKeyword.toBoolean())));
+                    rElements.push_back(std::unique_ptr<PDFElement>(new PDFBooleanElement(aKeyword.toBoolean())));
                 else if (aKeyword == "null")
-                    m_aElements.push_back(std::unique_ptr<PDFElement>(new PDFNullElement()));
+                    rElements.push_back(std::unique_ptr<PDFElement>(new PDFNullElement()));
                 else if (aKeyword == "xref")
                     // Allow 'f' and 'n' keywords.
                     bInXRef = true;
@@ -862,7 +889,7 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode)
                 else if (aKeyword == "trailer")
                 {
                     m_pTrailer = new PDFTrailerElement(*this);
-                    m_aElements.push_back(std::unique_ptr<PDFElement>(m_pTrailer));
+                    rElements.push_back(std::unique_ptr<PDFElement>(m_pTrailer));
                 }
                 else if (aKeyword == "startxref")
                 {
@@ -890,6 +917,11 @@ bool PDFDocument::Tokenize(SvStream& rStream, TokenizeMode eMode)
     return true;
 }
 
+void PDFDocument::SetIDObject(size_t nID, PDFObjectElement* pObject)
+{
+    m_aIDObjects[nID] = pObject;
+}
+
 bool PDFDocument::Read(SvStream& rStream)
 {
     // Check file magic.
@@ -917,16 +949,30 @@ bool PDFDocument::Read(SvStream& rStream)
     while (true)
     {
         rStream.Seek(nStartXRef);
-        ReadXRef(rStream);
-        if (!Tokenize(rStream, TokenizeMode::EOF_TOKEN))
+        OString aKeyword = ReadKeyword(rStream);
+        if (aKeyword.isEmpty())
+            ReadXRefStream(rStream);
+
+        else
         {
-            SAL_WARN("xmlsecurity.pdfio", "PDFDocument::Read: failed to tokenizer trailer after xref");
-            return false;
+            if (aKeyword != "xref")
+            {
+                SAL_WARN("xmlsecurity.pdfio", "PDFDocument::Read: xref is not the first keyword");
+                return false;
+            }
+            ReadXRef(rStream);
+            if (!Tokenize(rStream, TokenizeMode::EOF_TOKEN, m_aElements, nullptr))
+            {
+                SAL_WARN("xmlsecurity.pdfio", "PDFDocument::Read: failed to tokenizer trailer after xref");
+                return false;
+            }
         }
 
         PDFNumberElement* pPrev = nullptr;
         if (m_pTrailer)
             pPrev = dynamic_cast<PDFNumberElement*>(m_pTrailer->Lookup("Prev"));
+        else if (m_pXRefStream)
+            pPrev = dynamic_cast<PDFNumberElement*>(m_pXRefStream->Lookup("Prev"));
         if (pPrev)
             nStartXRef = pPrev->GetValue();
 
@@ -942,7 +988,7 @@ bool PDFDocument::Read(SvStream& rStream)
 
     // Then we can tokenize the stream.
     rStream.Seek(0);
-    return Tokenize(rStream, TokenizeMode::END_OF_STREAM);
+    return Tokenize(rStream, TokenizeMode::END_OF_STREAM, m_aElements, nullptr);
 }
 
 OString PDFDocument::ReadKeyword(SvStream& rStream)
@@ -997,7 +1043,7 @@ size_t PDFDocument::FindStartXRef(SvStream& rStream)
 void PDFDocument::ReadXRefStream(SvStream& rStream)
 {
     // Look up the stream length in the object dictionary.
-    if (!Tokenize(rStream, TokenizeMode::END_OF_OBJECT))
+    if (!Tokenize(rStream, TokenizeMode::END_OF_OBJECT, m_aElements, nullptr))
     {
         SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: failed to read object");
         return;
@@ -1024,6 +1070,9 @@ void PDFDocument::ReadXRefStream(SvStream& rStream)
         return;
     }
 
+    // So that the Prev key can be looked up later.
+    m_pXRefStream = pObject;
+
     PDFElement* pLookup = pObject->Lookup("Length");
     auto pNumber = dynamic_cast<PDFNumberElement*>(pLookup);
     if (!pNumber)
@@ -1095,25 +1144,37 @@ void PDFDocument::ReadXRefStream(SvStream& rStream)
 
     // Look up the first and the last entry we need to read.
     auto pIndex = dynamic_cast<PDFArrayElement*>(pObject->Lookup("Index"));
+    size_t nFirstObject = 0;
+    size_t nNumberOfObjects = 0;
     if (!pIndex || pIndex->GetElements().size() < 2)
     {
-        SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: Index not found or has < 2 elements");
-        return;
+        auto pSize = dynamic_cast<PDFNumberElement*>(pObject->Lookup("Size"));
+        if (pSize)
+            nNumberOfObjects = pSize->GetValue();
+        else
+        {
+            SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: Index not found or has < 2 elements");
+            return;
+        }
     }
-
-    const std::vector<PDFElement*>& rIndexElements = pIndex->GetElements();
-    auto pFirstObject = dynamic_cast<PDFNumberElement*>(rIndexElements[0]);
-    if (!pFirstObject)
+    else
     {
-        SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: Index has no first object");
-        return;
-    }
+        const std::vector<PDFElement*>& rIndexElements = pIndex->GetElements();
+        auto pFirstObject = dynamic_cast<PDFNumberElement*>(rIndexElements[0]);
+        if (!pFirstObject)
+        {
+            SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: Index has no first object");
+            return;
+        }
+        nFirstObject = pFirstObject->GetValue();
 
-    auto pNumberOfObjects = dynamic_cast<PDFNumberElement*>(rIndexElements[1]);
-    if (!pNumberOfObjects)
-    {
-        SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: Index has no number of objects");
-        return;
+        auto pNumberOfObjects = dynamic_cast<PDFNumberElement*>(rIndexElements[1]);
+        if (!pNumberOfObjects)
+        {
+            SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: Index has no number of objects");
+            return;
+        }
+        nNumberOfObjects = pNumberOfObjects->GetValue();
     }
 
     // Look up the format of a single entry.
@@ -1145,15 +1206,14 @@ void PDFDocument::ReadXRefStream(SvStream& rStream)
         return;
     }
 
-    size_t nSize = pNumberOfObjects->GetValue();
     aStream.Seek(0);
     // This is the line as read from the stream.
     std::vector<unsigned char> aOrigLine(nLineLength);
     // This is the line as it appears after tweaking according to nPredictor.
     std::vector<unsigned char> aFilteredLine(nLineLength);
-    for (size_t nEntry = 0; nEntry < nSize; ++nEntry)
+    for (size_t nEntry = 0; nEntry < nNumberOfObjects; ++nEntry)
     {
-        size_t nIndex = pFirstObject->GetValue() + nEntry;
+        size_t nIndex = nFirstObject + nEntry;
 
         aStream.ReadBytes(aOrigLine.data(), aOrigLine.size());
         if (aOrigLine[0] + 10 != nPredictor)
@@ -1210,12 +1270,15 @@ void PDFDocument::ReadXRefStream(SvStream& rStream)
         }
 
         // "n" entry of the xref table
-        if (nType == 1)
+        if (nType == 1 || nType == 2)
         {
             if (m_aXRef.find(nIndex) == m_aXRef.end())
             {
-                m_aXRef[nIndex] = nStreamOffset;
-                m_aXRefDirty[nIndex] = false;
+                XRefEntry aEntry;
+                aEntry.m_eType = nType == 1 ? XRefEntryType::NOT_COMPRESSED : XRefEntryType::COMPRESSED;
+                aEntry.m_nOffset = nStreamOffset;
+                aEntry.m_nGenerationNumber = nGenerationNumber;
+                m_aXRef[nIndex] = aEntry;
             }
         }
     }
@@ -1223,19 +1286,6 @@ void PDFDocument::ReadXRefStream(SvStream& rStream)
 
 void PDFDocument::ReadXRef(SvStream& rStream)
 {
-    OString aKeyword = ReadKeyword(rStream);
-    if (aKeyword.isEmpty())
-    {
-        ReadXRefStream(rStream);
-        return;
-    }
-
-    if (aKeyword != "xref")
-    {
-        SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRef: xref is not the first keyword");
-        return;
-    }
-
     PDFDocument::SkipWhitespace(rStream);
 
     while (true)
@@ -1288,7 +1338,7 @@ void PDFDocument::ReadXRef(SvStream& rStream)
             }
 
             PDFDocument::SkipWhitespace(rStream);
-            aKeyword = ReadKeyword(rStream);
+            OString aKeyword = ReadKeyword(rStream);
             if (aKeyword != "f" && aKeyword != "n")
             {
                 SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRef: unexpected keyword");
@@ -1298,9 +1348,13 @@ void PDFDocument::ReadXRef(SvStream& rStream)
             // offset with an older one.
             if (m_aXRef.find(nIndex) == m_aXRef.end())
             {
-                m_aXRef[nIndex] = aOffset.GetValue();
+                XRefEntry aEntry;
+                aEntry.m_nOffset = aOffset.GetValue();
+                aEntry.m_nGenerationNumber = aGenerationNumber.GetValue();
                 // Initially only the first entry is dirty.
-                m_aXRefDirty[nIndex] = nIndex == 0;
+                if (nIndex == 0)
+                    aEntry.m_bDirty = true;
+                m_aXRef[nIndex] = aEntry;
             }
             PDFDocument::SkipWhitespace(rStream);
         }
@@ -1346,13 +1400,13 @@ void PDFDocument::SkipLineBreaks(SvStream& rStream)
 size_t PDFDocument::GetObjectOffset(size_t nIndex) const
 {
     auto it = m_aXRef.find(nIndex);
-    if (it == m_aXRef.end())
+    if (it == m_aXRef.end() || it->second.m_eType == XRefEntryType::COMPRESSED)
     {
         SAL_WARN("xmlsecurity.pdfio", "PDFDocument::GetObjectOffset: wanted to look up index #" << nIndex << ", but failed");
         return 0;
     }
 
-    return it->second;
+    return it->second.m_nOffset;
 }
 
 const std::vector< std::unique_ptr<PDFElement> >& PDFDocument::GetElements()
@@ -1360,11 +1414,6 @@ const std::vector< std::unique_ptr<PDFElement> >& PDFDocument::GetElements()
     return m_aElements;
 }
 
-const std::map<size_t, PDFObjectElement*>& PDFDocument::GetIDObjects() const
-{
-    return m_aIDObjects;
-}
-
 std::vector<PDFObjectElement*> PDFDocument::GetPages()
 {
     std::vector<PDFObjectElement*> aRet;
@@ -2011,7 +2060,8 @@ PDFObjectElement::PDFObjectElement(PDFDocument& rDoc, double fObjectValue, doubl
       m_nDictionaryOffset(0),
       m_nDictionaryLength(0),
       m_pDictionaryElement(nullptr),
-      m_pArrayElement(nullptr)
+      m_pArrayElement(nullptr),
+      m_pStreamElement(nullptr)
 {
 }
 
@@ -2236,7 +2286,14 @@ PDFElement* PDFDictionaryElement::Lookup(const std::map<OString, PDFElement*>& r
 PDFElement* PDFObjectElement::Lookup(const OString& rDictionaryKey)
 {
     if (m_aDictionary.empty())
-        PDFDictionaryElement::Parse(m_rDoc.GetElements(), this, m_aDictionary);
+    {
+        if (!m_aElements.empty())
+            // This is a stored object in an object stream.
+            PDFDictionaryElement::Parse(m_aElements, this, m_aDictionary);
+        else
+            // Normal object: elements are stored as members of the document itself.
+            PDFDictionaryElement::Parse(m_rDoc.GetElements(), this, m_aDictionary);
+    }
 
     return PDFDictionaryElement::Lookup(m_aDictionary, rDictionaryKey);
 }
@@ -2332,11 +2389,139 @@ void PDFObjectElement::SetArray(PDFArrayElement* pArrayElement)
     m_pArrayElement = pArrayElement;
 }
 
+void PDFObjectElement::SetStream(PDFStreamElement* pStreamElement)
+{
+    m_pStreamElement = pStreamElement;
+}
+
 PDFArrayElement* PDFObjectElement::GetArray() const
 {
     return m_pArrayElement;
 }
 
+void PDFObjectElement::ParseStoredObjects()
+{
+    if (!m_pStreamElement)
+    {
+        SAL_WARN("xmlsecurity.pdfio", "PDFObjectElement::ParseStoredObjects: no stream");
+        return;
+    }
+
+    auto pType = dynamic_cast<PDFNameElement*>(Lookup("Type"));
+    if (!pType || pType->GetValue() != "ObjStm")
+    {
+        SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: missing or unexpected type: " << pType->GetValue());
+        return;
+    }
+
+    auto pFilter = dynamic_cast<PDFNameElement*>(Lookup("Filter"));
+    if (!pFilter || pFilter->GetValue() != "FlateDecode")
+    {
+        SAL_WARN("xmlsecurity.pdfio", "PDFDocument::ReadXRefStream: missing or unexpected filter");
+        return;
+    }
+
+    auto pFirst = dynamic_cast<PDFNumberElement*>(Lookup("First"));
+    if (!pFirst)
+    {
+        SAL_WARN("xmlsecurity.pdfio", "PDFObjectElement::ParseStoredObjects: no First");
+        return;
+    }
+
+    auto pN = dynamic_cast<PDFNumberElement*>(Lookup("N"));
+    if (!pN)
+    {
+        SAL_WARN("xmlsecurity.pdfio", "PDFObjectElement::ParseStoredObjects: no N");
+        return;
+    }
+    size_t nN = pN->GetValue();
+
+    auto pLength = dynamic_cast<PDFNumberElement*>(Lookup("Length"));
+    if (!pLength)
+    {
+        SAL_WARN("xmlsecurity.pdfio", "PDFObjectElement::ParseStoredObjects: no length");
+        return;
+    }
+    size_t nLength = pLength->GetValue();
+
+    // Read and decompress it.
+    SvMemoryStream& rEditBuffer = m_rDoc.GetEditBuffer();
+    rEditBuffer.Seek(m_pStreamElement->GetOffset());
+    std::vector<char> aBuf(nLength);
+    rEditBuffer.ReadBytes(aBuf.data(), aBuf.size());
+    SvMemoryStream aSource(aBuf.data(), aBuf.size(), StreamMode::READ);
+    SvMemoryStream aStream;
+    ZCodec aZCodec;
+    aZCodec.BeginCompression();
+    aZCodec.Decompress(aSource, aStream);
+    if (!aZCodec.EndCompression())
+    {
+        SAL_WARN("xmlsecurity.pdfio", "PDFObjectElement::ParseStoredObjects: decompression failed");
+        return;
+    }
+
+    aStream.Seek(STREAM_SEEK_TO_END);
+    nLength = aStream.Tell();
+    aStream.Seek(0);
+    std::vector<size_t> aObjNums;
+    std::vector<size_t> aOffsets;
+    std::vector<size_t> aLengths;
+    // First iterate over and find out the lengths.
+    for (size_t nObject = 0; nObject < nN; ++nObject)
+    {
+        PDFNumberElement aObjNum;
+        if (!aObjNum.Read(aStream))
+        {
+            SAL_WARN("xmlsecurity.pdfio", "PDFObjectElement::ParseStoredObjects: failed to read object number");
+            return;
+        }
+        aObjNums.push_back(aObjNum.GetValue());
+
+        PDFDocument::SkipWhitespace(aStream);
+
+        PDFNumberElement aByteOffset;
+        if (!aByteOffset.Read(aStream))
+        {
+            SAL_WARN("xmlsecurity.pdfio", "PDFObjectElement::ParseStoredObjects: failed to read byte offset");
+            return;
+        }
+        aOffsets.push_back(pFirst->GetValue() + aByteOffset.GetValue());
+
+        if (aOffsets.size() > 1)
+            aLengths.push_back(aOffsets.back() - aOffsets[aOffsets.size() - 2]);
+        if (nObject + 1 == nN)
+            aLengths.push_back(nLength - aOffsets.back());
+
+        PDFDocument::SkipWhitespace(aStream);
+    }
+
+    // Now create streams with the proper length and tokenize the data.
+    for (size_t nObject = 0; nObject < nN; ++nObject)
+    {
+        size_t nObjNum = aObjNums[nObject];
+        size_t nOffset = aOffsets[nObject];
+        size_t nLen = aLengths[nObject];
+
+        aStream.Seek(nOffset);
+        m_aStoredElements.push_back(std::unique_ptr<PDFObjectElement>(new PDFObjectElement(m_rDoc, nObjNum, 0)));
+        PDFObjectElement* pStored = m_aStoredElements.back().get();
+
+        aBuf.clear();
+        aBuf.resize(nLen);
+        aStream.ReadBytes(aBuf.data(), aBuf.size());
+        SvMemoryStream aStoredStream(aBuf.data(), aBuf.size(), StreamMode::READ);
+
+        m_rDoc.Tokenize(aStoredStream, TokenizeMode::STORED_OBJECT, pStored->GetStoredElements(), pStored);
+        // This is how references know the object is stored inside this object stream.
+        m_rDoc.SetIDObject(nObjNum, pStored);
+    }
+}
+
+std::vector< std::unique_ptr<PDFElement> >& PDFObjectElement::GetStoredElements()
+{
+    return m_aElements;
+}
+
 PDFReferenceElement::PDFReferenceElement(PDFDocument& rDoc, int fObjectValue, int fGenerationValue)
     : m_rDoc(rDoc),
       m_fObjectValue(fObjectValue),
@@ -2409,17 +2594,43 @@ double PDFReferenceElement::LookupNumber(SvStream& rStream) const
     return aNumber.GetValue();
 }
 
-PDFObjectElement* PDFReferenceElement::LookupObject() const
+PDFObjectElement* PDFReferenceElement::LookupObject()
 {
-    const std::map<size_t, PDFObjectElement*>& rIDObjects = m_rDoc.GetIDObjects();
-    auto it = rIDObjects.find(m_fObjectValue);
-    if (it != rIDObjects.end())
-        return it->second;
+    return m_rDoc.LookupObject(m_fObjectValue);
+}
 
-    SAL_WARN("xmlsecurity.pdfio", "PDFReferenceElement::LookupObject: can't find obj " << m_fObjectValue);
+PDFObjectElement* PDFDocument::LookupObject(size_t nObjectNumber)
+{
+    auto itIDObjects = m_aIDObjects.find(nObjectNumber);
+    auto itXRef = m_aXRef.find(nObjectNumber);
+    if (itIDObjects == m_aIDObjects.end() && itXRef != m_aXRef.end())
+    {
+        // We don't have an object for this number yet, but there is an xref
+        // entry for it.
+        const XRefEntry& rEntry = itXRef->second;
+        if (rEntry.m_eType == XRefEntryType::COMPRESSED)
+        {
+            // It's a compressed entry, try parsing the stored objects.
+            if (PDFObjectElement* pObjectStream = LookupObject(rEntry.m_nOffset))
+                // This registers new IDs.
+                pObjectStream->ParseStoredObjects();
+        }
+        // Find again, now that the new objects are registered.
+        itIDObjects = m_aIDObjects.find(nObjectNumber);
+    }
+
+    if (itIDObjects != m_aIDObjects.end())
+        return itIDObjects->second;
+
+    SAL_WARN("xmlsecurity.pdfio", "PDFDocument::LookupObject: can't find obj " << nObjectNumber);
     return nullptr;
 }
 
+SvMemoryStream& PDFDocument::GetEditBuffer()
+{
+    return m_aEditBuffer;
+}
+
 int PDFReferenceElement::GetObjectValue() const
 {
     return m_fObjectValue;


More information about the Libreoffice-commits mailing list