[Libreoffice-commits] core.git: vcl/inc vcl/Library_vcl.mk vcl/qa vcl/source

Tomaž Vajngerl (via logerrit) logerrit at kemper.freedesktop.org
Mon Nov 2 19:11:43 UTC 2020


 vcl/Library_vcl.mk                     |    1 
 vcl/inc/pdf/ExternalPDFStreams.hxx     |   70 +++++++++++++++++++++++++++++++++
 vcl/inc/pdf/objectcopier.hxx           |    3 +
 vcl/qa/cppunit/pdfexport/pdfexport.cxx |    2 
 vcl/source/gdi/pdfobjectcopier.cxx     |    7 ++-
 vcl/source/gdi/pdfwriter_impl.cxx      |   33 +++++++--------
 vcl/source/gdi/pdfwriter_impl.hxx      |   21 +++++++--
 vcl/source/pdf/ExternalPDFStreams.cxx  |   43 ++++++++++++++++++++
 8 files changed, 155 insertions(+), 25 deletions(-)

New commits:
commit c724c1bec549f224656e7ca3290494159dda7e26
Author:     Tomaž Vajngerl <tomaz.vajngerl at collabora.co.uk>
AuthorDate: Wed Oct 28 13:55:23 2020 +0100
Commit:     Tomaž Vajngerl <quikee at gmail.com>
CommitDate: Mon Nov 2 20:11:03 2020 +0100

    pdf: deduplicate resources when copying from external PDF stream
    
    When using external PDF stream/data (from PDF graphic objects),
    make sure to copy the content of external PDF resources (fonts,
    bitmaps, forms) only once (by sharing the map between calls),
    and for every other use just reference the already copied objects.
    
    Change-Id: Ibaa632c8f74806eb195e69404551db6fd077a986
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/104935
    Tested-by: Jenkins CollaboraOffice <jenkinscollaboraoffice at gmail.com>
    Reviewed-by: Miklos Vajna <vmiklos at collabora.com>
    Reviewed-on: https://gerrit.libreoffice.org/c/core/+/105163
    Tested-by: Jenkins
    Reviewed-by: Tomaž Vajngerl <quikee at gmail.com>

diff --git a/vcl/Library_vcl.mk b/vcl/Library_vcl.mk
index ee68260fbd50..bc6b8f8f9af7 100644
--- a/vcl/Library_vcl.mk
+++ b/vcl/Library_vcl.mk
@@ -322,6 +322,7 @@ $(eval $(call gb_Library_add_exception_objects,vcl,\
     vcl/source/pdf/Matrix3 \
     vcl/source/pdf/XmpMetadata \
     vcl/source/pdf/PDFiumLibrary \
+    vcl/source/pdf/ExternalPDFStreams \
     vcl/source/graphic/GraphicID \
     vcl/source/graphic/GraphicLoader \
     vcl/source/graphic/GraphicObject \
diff --git a/vcl/inc/pdf/ExternalPDFStreams.hxx b/vcl/inc/pdf/ExternalPDFStreams.hxx
new file mode 100644
index 000000000000..3a9ea38bc9db
--- /dev/null
+++ b/vcl/inc/pdf/ExternalPDFStreams.hxx
@@ -0,0 +1,70 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#pragma once
+
+#include <sal/types.h>
+#include <sal/log.hxx>
+#include <vcl/dllapi.h>
+
+#include <map>
+#include <vector>
+#include <memory>
+
+#include <vcl/filter/pdfdocument.hxx>
+
+namespace vcl
+{
+// An external PDF stream, which stores the PDF stream data as a byte array.
+// This struct is also responsible for parsing the stream as a PDFDocument,
+// and stores its instance for the life-cycle of the struct, so that it is
+// reused to avoid unnecessary parsing.
+struct VCL_DLLPUBLIC ExternalPDFStream
+{
+    std::vector<sal_uInt8> maData;
+    std::shared_ptr<filter::PDFDocument> mpPDFDocument;
+    std::map<sal_Int32, sal_Int32> maCopiedResources;
+
+    std::map<sal_Int32, sal_Int32>& getCopiedResources() { return maCopiedResources; }
+
+    filter::PDFDocument& getPDFDocument()
+    {
+        if (!mpPDFDocument)
+        {
+            SvMemoryStream aPDFStream;
+            aPDFStream.WriteBytes(maData.data(), maData.size());
+            aPDFStream.Seek(0);
+            mpPDFDocument = std::make_shared<filter::PDFDocument>();
+            if (!mpPDFDocument->Read(aPDFStream))
+            {
+                SAL_WARN("vcl.pdfwriter",
+                         "PDFWriterImpl::writeReferenceXObject: reading the PDF document failed");
+            }
+        }
+        return *mpPDFDocument;
+    }
+};
+
+// Class to manage external PDF streams, for de-duplication purposes.
+class VCL_DLLPUBLIC ExternalPDFStreams
+{
+private:
+    std::map<std::vector<sal_uInt8>, sal_Int32> maStreamIndexMap;
+    std::vector<ExternalPDFStream> maStreamList;
+
+public:
+    ExternalPDFStreams() {}
+
+    sal_Int32 store(const sal_uInt8* pData, sal_uInt32 nLength);
+
+    ExternalPDFStream& get(sal_uInt32 nIndex);
+};
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/vcl/inc/pdf/objectcopier.hxx b/vcl/inc/pdf/objectcopier.hxx
index 6e98ed0834f9..65dbbb49aef4 100644
--- a/vcl/inc/pdf/objectcopier.hxx
+++ b/vcl/inc/pdf/objectcopier.hxx
@@ -48,6 +48,9 @@ public:
     /// Copies resources of pPage into rLine.
     void copyPageResources(filter::PDFObjectElement* pPage, OStringBuffer& rLine);
 
+    void copyPageResources(filter::PDFObjectElement* pPage, OStringBuffer& rLine,
+                           std::map<sal_Int32, sal_Int32>& rCopiedResources);
+
     /// Copies page one or more page streams from rContentStreams into rStream.
     static sal_Int32 copyPageStreams(std::vector<filter::PDFObjectElement*>& rContentStreams,
                                      SvMemoryStream& rStream, bool& rCompressed);
diff --git a/vcl/qa/cppunit/pdfexport/pdfexport.cxx b/vcl/qa/cppunit/pdfexport/pdfexport.cxx
index e9027e7e0aad..762bdf0dbce1 100644
--- a/vcl/qa/cppunit/pdfexport/pdfexport.cxx
+++ b/vcl/qa/cppunit/pdfexport/pdfexport.cxx
@@ -2114,7 +2114,7 @@ CPPUNIT_TEST_FIXTURE(PdfExportTest, testMultiPagePDF)
     {   // embedded PDF page 2
         vcl::filter::PDFObjectElement* pXObject2 = pXObjects->LookupObject(rIDs[1]);
         CPPUNIT_ASSERT(pXObject2);
-        CPPUNIT_ASSERT_EQUAL(OString("Im34"), rIDs[1]);
+        CPPUNIT_ASSERT_EQUAL(OString("Im24"), rIDs[1]);
 
         auto pSubtype2 = dynamic_cast<vcl::filter::PDFNameElement*>(pXObject2->Lookup("Subtype"));
         CPPUNIT_ASSERT(pSubtype2);
diff --git a/vcl/source/gdi/pdfobjectcopier.cxx b/vcl/source/gdi/pdfobjectcopier.cxx
index 5e54ee68c289..a953c864c122 100644
--- a/vcl/source/gdi/pdfobjectcopier.cxx
+++ b/vcl/source/gdi/pdfobjectcopier.cxx
@@ -275,13 +275,18 @@ void PDFObjectCopier::copyPageResources(filter::PDFObjectElement* pPage, OString
 {
     // Maps from source object id (PDF image) to target object id (export result).
     std::map<sal_Int32, sal_Int32> aCopiedResources;
+    copyPageResources(pPage, rLine, aCopiedResources);
+}
 
+void PDFObjectCopier::copyPageResources(filter::PDFObjectElement* pPage, OStringBuffer& rLine,
+                                        std::map<sal_Int32, sal_Int32>& rCopiedResources)
+{
     rLine.append(" /Resources <<");
     static const std::initializer_list<OString> aKeys
         = { "ColorSpace", "ExtGState", "Font", "XObject", "Shading" };
     for (const auto& rKey : aKeys)
     {
-        rLine.append(copyExternalResources(*pPage, rKey, aCopiedResources));
+        rLine.append(copyExternalResources(*pPage, rKey, rCopiedResources));
     }
     rLine.append(">>");
 }
diff --git a/vcl/source/gdi/pdfwriter_impl.cxx b/vcl/source/gdi/pdfwriter_impl.cxx
index 1e7d8db238e1..11de3436e531 100644
--- a/vcl/source/gdi/pdfwriter_impl.cxx
+++ b/vcl/source/gdi/pdfwriter_impl.cxx
@@ -8358,7 +8358,7 @@ bool PDFWriterImpl::writeGradientFunction( GradientEmit const & rObject )
 
 void PDFWriterImpl::writeJPG( JPGEmit& rObject )
 {
-    if (!rObject.m_aReferenceXObject.m_aPDFData.empty() && !m_aContext.UseReferenceXObject)
+    if (rObject.m_aReferenceXObject.hasExternalPDFData() && !m_aContext.UseReferenceXObject)
     {
         writeReferenceXObject(rObject.m_aReferenceXObject);
         return;
@@ -8457,23 +8457,19 @@ void PDFWriterImpl::writeReferenceXObject(ReferenceXObjectEmit& rEmit)
     {
         // Parse the PDF data, we need that to write the PDF dictionary of our
         // object.
-        SvMemoryStream aPDFStream;
-        aPDFStream.WriteBytes(rEmit.m_aPDFData.data(), rEmit.m_aPDFData.size());
-        aPDFStream.Seek(0);
-        filter::PDFDocument aPDFDocument;
-        if (!aPDFDocument.Read(aPDFStream))
-        {
-            SAL_WARN("vcl.pdfwriter", "PDFWriterImpl::writeReferenceXObject: reading the PDF document failed");
+        if (rEmit.m_nExternalPDFDataIndex < 0)
             return;
-        }
-        std::vector<filter::PDFObjectElement*> aPages = aPDFDocument.GetPages();
+        auto & rExternalPDFStream = m_aExternalPDFStreams.get(rEmit.m_nExternalPDFDataIndex);
+        auto & rPDFDocument = rExternalPDFStream.getPDFDocument();
+
+        std::vector<filter::PDFObjectElement*> aPages = rPDFDocument.GetPages();
         if (aPages.empty())
         {
             SAL_WARN("vcl.pdfwriter", "PDFWriterImpl::writeReferenceXObject: no pages");
             return;
         }
 
-        size_t nPageIndex = rEmit.m_nPDFPageIndex >= 0 ? rEmit.m_nPDFPageIndex : 0;
+        size_t nPageIndex = rEmit.m_nExternalPDFPageIndex >= 0 ? rEmit.m_nExternalPDFPageIndex : 0;
 
         filter::PDFObjectElement* pPage = aPages[nPageIndex];
         if (!pPage)
@@ -8545,7 +8541,9 @@ void PDFWriterImpl::writeReferenceXObject(ReferenceXObjectEmit& rEmit)
         }
 
         PDFObjectCopier aCopier(*this);
-        aCopier.copyPageResources(pPage, aLine);
+        auto & rResources = rExternalPDFStream.getCopiedResources();
+        aCopier.copyPageResources(pPage, aLine, rResources);
+
         aLine.append(" /BBox [ 0 0 ");
         aLine.append(nWidth);
         aLine.append(" ");
@@ -8687,7 +8685,7 @@ namespace
 
 bool PDFWriterImpl::writeBitmapObject( BitmapEmit& rObject, bool bMask )
 {
-    if (!rObject.m_aReferenceXObject.m_aPDFData.empty() && !m_aContext.UseReferenceXObject)
+    if (rObject.m_aReferenceXObject.hasExternalPDFData() && !m_aContext.UseReferenceXObject)
     {
         writeReferenceXObject(rObject.m_aReferenceXObject);
         return true;
@@ -9009,10 +9007,10 @@ void PDFWriterImpl::createEmbeddedFile(const Graphic& rGraphic, ReferenceXObject
     sal_uInt32 nLength = rGraphic.getVectorGraphicData()->getVectorGraphicDataArrayLength();
     auto const & rArray = rGraphic.getVectorGraphicData()->getVectorGraphicDataArray();
 
-    auto pPDFData = std::make_shared<std::vector<sal_Int8>>(rArray.getConstArray(), rArray.getConstArray() + nLength);
-
     if (m_aContext.UseReferenceXObject)
     {
+        auto pPDFData = std::make_shared<std::vector<sal_Int8>>(rArray.getConstArray(), rArray.getConstArray() + nLength);
+
         // Store the original PDF data as an embedded file.
         m_aEmbeddedFiles.emplace_back();
         m_aEmbeddedFiles.back().m_nObject = createObject();
@@ -9021,8 +9019,9 @@ void PDFWriterImpl::createEmbeddedFile(const Graphic& rGraphic, ReferenceXObject
     }
     else
     {
-        rEmit.m_nPDFPageIndex = rGraphic.getVectorGraphicData()->getPageIndex();
-        rEmit.m_aPDFData = *pPDFData;
+        sal_Int32 aIndex = m_aExternalPDFStreams.store(reinterpret_cast<const sal_uInt8*>(rArray.getConstArray()), nLength);
+        rEmit.m_nExternalPDFPageIndex = rGraphic.getVectorGraphicData()->getPageIndex();
+        rEmit.m_nExternalPDFDataIndex = aIndex;
     }
 
     rEmit.m_nFormObject = createObject();
diff --git a/vcl/source/gdi/pdfwriter_impl.hxx b/vcl/source/gdi/pdfwriter_impl.hxx
index f89cfd2e94f6..dc96454609f9 100644
--- a/vcl/source/gdi/pdfwriter_impl.hxx
+++ b/vcl/source/gdi/pdfwriter_impl.hxx
@@ -53,6 +53,7 @@
 
 #include <outdata.hxx>
 #include <vcl/filter/pdfobjectcontainer.hxx>
+#include <pdf/ExternalPDFStreams.hxx>
 #include "pdffontcache.hxx"
 #include "pdfbuildin_fonts.hxx"
 
@@ -190,19 +191,25 @@ struct ReferenceXObjectEmit
     /// Size of the bitmap replacement, in pixels.
     Size m_aPixelSize;
     /// PDF data from the graphic object, if not writing a reference XObject.
-    std::vector<sal_Int8> m_aPDFData;
-    sal_Int32 m_nPDFPageIndex;
+    sal_Int32 m_nExternalPDFDataIndex;
+    sal_Int32 m_nExternalPDFPageIndex;
 
     ReferenceXObjectEmit()
-        : m_nFormObject(0),
-          m_nEmbeddedObject(0),
-          m_nBitmapObject(0),
-          m_nPDFPageIndex(-1)
+        : m_nFormObject(0)
+        , m_nEmbeddedObject(0)
+        , m_nBitmapObject(0)
+        , m_nExternalPDFDataIndex(-1)
+        , m_nExternalPDFPageIndex(-1)
     {
     }
 
     /// Returns the ID one should use when referring to this bitmap.
     sal_Int32 getObject() const;
+
+    bool hasExternalPDFData() const
+    {
+        return m_nExternalPDFDataIndex >= 0;
+    }
 };
 
 struct BitmapEmit
@@ -740,6 +747,8 @@ private:
     osl::File                           m_aFile;
     bool                                m_bOpen;
 
+    ExternalPDFStreams m_aExternalPDFStreams;
+
     /* output redirection; e.g. to accumulate content streams for
        XObjects
      */
diff --git a/vcl/source/pdf/ExternalPDFStreams.cxx b/vcl/source/pdf/ExternalPDFStreams.cxx
new file mode 100644
index 000000000000..08f31ed22829
--- /dev/null
+++ b/vcl/source/pdf/ExternalPDFStreams.cxx
@@ -0,0 +1,43 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ */
+
+#include <pdf/ExternalPDFStreams.hxx>
+#include <comphelper/hash.hxx>
+
+namespace vcl
+{
+sal_Int32 ExternalPDFStreams::store(const sal_uInt8* pData, sal_uInt32 nLength)
+{
+    sal_Int32 nIndex = -1;
+
+    std::vector<sal_uInt8> aHash
+        = comphelper::Hash::calculateHash(pData, nLength, comphelper::HashType::SHA1);
+
+    auto it = maStreamIndexMap.find(aHash);
+    if (it == maStreamIndexMap.end())
+    {
+        auto& rExternalStream = maStreamList.emplace_back();
+        rExternalStream.maData.resize(nLength);
+        std::copy(pData, pData + nLength, rExternalStream.maData.begin());
+        nIndex = maStreamList.size() - 1;
+        maStreamIndexMap.emplace(aHash, nIndex);
+    }
+    else
+    {
+        nIndex = it->second;
+    }
+
+    return nIndex;
+}
+
+ExternalPDFStream& ExternalPDFStreams::get(sal_uInt32 nIndex) { return maStreamList.at(nIndex); }
+}
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */


More information about the Libreoffice-commits mailing list