[Libreoffice-commits] online.git: common/SigUtil.cpp common/SigUtil.hpp loolwsd.xml.in wsd/Admin.cpp wsd/Admin.hpp wsd/AdminModel.cpp wsd/AdminModel.hpp wsd/LOOLWSD.cpp

Gabriel Masei (via logerrit) logerrit at kemper.freedesktop.org
Tue Jun 9 09:26:02 UTC 2020


 common/SigUtil.cpp |    6 ++---
 common/SigUtil.hpp |    3 +-
 loolwsd.xml.in     |    7 +++++
 wsd/Admin.cpp      |   26 +++++++++++++++++++--
 wsd/Admin.hpp      |    4 +++
 wsd/AdminModel.cpp |   63 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 wsd/AdminModel.hpp |   60 ++++++++++++++++++++++++++++++++++++++++++--------
 wsd/LOOLWSD.cpp    |   15 ++++++++++++
 8 files changed, 167 insertions(+), 17 deletions(-)

New commits:
commit ac17984226dac695fbd68699e168e2332f1ab9c7
Author:     Gabriel Masei <gabriel.masei at 1and1.ro>
AuthorDate: Mon Jun 8 13:41:03 2020 +0300
Commit:     Michael Meeks <michael.meeks at collabora.com>
CommitDate: Tue Jun 9 11:25:44 2020 +0200

    admin: cleanup resource consuming kits
    
    Change-Id: Ifafbadc61b788adc90c03fb92e0231f9e599c360
    Reviewed-on: https://gerrit.libreoffice.org/c/online/+/95794
    Tested-by: Jenkins
    Tested-by: Jenkins CollaboraOffice <jenkinscollaboraoffice at gmail.com>
    Reviewed-by: Michael Meeks <michael.meeks at collabora.com>

diff --git a/common/SigUtil.cpp b/common/SigUtil.cpp
index 5a587f30d..4891010db 100644
--- a/common/SigUtil.cpp
+++ b/common/SigUtil.cpp
@@ -355,13 +355,13 @@ namespace SigUtil
         sigaction(SIGUSR1, &action, nullptr);
     }
 
-    /// Kill the given pid with SIGTERM.  Returns true when the pid does not exist any more.
-    bool killChild(const int pid)
+    /// Kill the given pid with SIGKILL as default.  Returns true when the pid does not exist any more.
+    bool killChild(const int pid, const int signal)
     {
         LOG_DBG("Killing PID: " << pid);
         // Don't kill anything in the fuzzer case: pid == 0 would kill the fuzzer itself, and
         // killing random other processes is not a great idea, either.
-        if (Util::isFuzzing() || kill(pid, SIGKILL) == 0 || errno == ESRCH)
+        if (Util::isFuzzing() || kill(pid, signal) == 0 || errno == ESRCH)
         {
             // Killed or doesn't exist.
             return true;
diff --git a/common/SigUtil.hpp b/common/SigUtil.hpp
index 9bccf0256..d3eaf9726 100644
--- a/common/SigUtil.hpp
+++ b/common/SigUtil.hpp
@@ -11,6 +11,7 @@
 
 #include <atomic>
 #include <mutex>
+#include <signal.h>
 
 #if MOBILEAPP
 static constexpr bool ShutdownRequestFlag(false);
@@ -73,7 +74,7 @@ namespace SigUtil
     /// Kills a child process and returns true when
     /// child pid is removed from the process table
     /// after a certain (short) timeout.
-    bool killChild(const int pid);
+    bool killChild(const int pid, const int signal = SIGKILL);
 
     /// Dump a signal-safe back-trace
     void dumpBacktrace();
diff --git a/loolwsd.xml.in b/loolwsd.xml.in
index e66f3526d..27e1d6e87 100644
--- a/loolwsd.xml.in
+++ b/loolwsd.xml.in
@@ -30,6 +30,13 @@
         <limit_num_open_files desc="The maximum number of files allowed to each document process to open. 0 for unlimited." type="uint">0</limit_num_open_files>
         <limit_load_secs desc="Maximum number of seconds to wait for a document load to succeed. 0 for unlimited." type="uint" default="100">100</limit_load_secs>
         <limit_convert_secs desc="Maximum number of seconds to wait for a document conversion to succeed. 0 for unlimited." type="uint" default="100">100</limit_convert_secs>
+        <cleanup desc="Checks for resource consuming (bad) documents and kills associated kit process. A document is considered resource consuming (bad) if is in idle state for idle_time_secs period and memory usage passed limit_dirty_mem_mb or CPU usage passed limit_cpu_per" enable="false">
+            <cleanup_interval_ms desc="Interval between two checks" type="uint" default="10000">10000</cleanup_interval_ms>
+            <bad_behavior_period_secs desc="Minimum time period for a document to be in bad state before associated kit process is killed. If in this period the condition for bad document is not met once then this period is reset" type="uint" default="60">60</bad_behavior_period_secs>
+            <idle_time_secs desc="Minimum idle time for a document to be candidate for bad state" type="uint" default="300">300</idle_time_secs>
+            <limit_dirty_mem_mb desc="Minimum memory usage for a document to be candidate for bad state" type="uint" default="3072">3072</limit_dirty_mem_mb>
+            <limit_cpu_per desc="Minimum CPU usage for a document to be candidate for bad state" type="uint" default="85">85</limit_cpu_per>
+        </cleanup>
     </per_document>
 
     <per_view desc="View-specific settings.">
diff --git a/wsd/Admin.cpp b/wsd/Admin.cpp
index 78956b222..c1aec45da 100644
--- a/wsd/Admin.cpp
+++ b/wsd/Admin.cpp
@@ -389,7 +389,8 @@ Admin::Admin() :
     _lastRecvCount(0),
     _cpuStatsTaskIntervalMs(DefStatsIntervalMs),
     _memStatsTaskIntervalMs(DefStatsIntervalMs * 2),
-    _netStatsTaskIntervalMs(DefStatsIntervalMs * 2)
+    _netStatsTaskIntervalMs(DefStatsIntervalMs * 2),
+    _cleanupIntervalMs(DefStatsIntervalMs * 10)
 {
     LOG_INF("Admin ctor.");
 
@@ -415,13 +416,14 @@ Admin::~Admin()
 
 void Admin::pollingThread()
 {
-    std::chrono::steady_clock::time_point lastCPU, lastMem, lastNet;
+    std::chrono::steady_clock::time_point lastCPU, lastMem, lastNet, lastCleanup;
 
     _model.setThreadOwner(std::this_thread::get_id());
 
     lastCPU = std::chrono::steady_clock::now();
     lastMem = lastCPU;
     lastNet = lastCPU;
+    lastCleanup = lastCPU;
 
     while (!isStop() && !SigUtil::getTerminationFlag() && !SigUtil::getShutdownRequestFlag())
     {
@@ -483,6 +485,19 @@ void Admin::pollingThread()
             lastNet = now;
         }
 
+        int cleanupWait = _cleanupIntervalMs -
+                std::chrono::duration_cast<std::chrono::milliseconds>(now - lastCleanup).count();
+        if (cleanupWait <= MinStatsIntervalMs / 2) // Close enough
+        {
+            if (_defDocProcSettings.getCleanupSettings().getEnable())
+            {
+                cleanupResourceConsumingDocs();
+                
+                cleanupWait += _cleanupIntervalMs;
+                lastCleanup = now;
+            }
+        }
+
         // (re)-connect (with sync. DNS - urk) to one monitor at a time
         if (_pendingConnects.size())
         {
@@ -495,7 +510,7 @@ void Admin::pollingThread()
         }
 
         // Handle websockets & other work.
-        const int timeout = capAndRoundInterval(std::min(std::min(cpuWait, memWait), netWait));
+        const int timeout = capAndRoundInterval(std::min(std::min(std::min(cpuWait, memWait), netWait), cleanupWait));
         LOG_TRC("Admin poll for " << timeout << "ms.");
         poll(timeout * 1000); // continue with ms for admin, settings etc.
     }
@@ -773,6 +788,11 @@ void Admin::notifyDocsMemDirtyChanged()
     _model.notifyDocsMemDirtyChanged();
 }
 
+void Admin::cleanupResourceConsumingDocs()
+{
+    _model.cleanupResourceConsumingDocs();
+}
+
 void Admin::dumpState(std::ostream& os)
 {
     // FIXME: be more helpful ...
diff --git a/wsd/Admin.hpp b/wsd/Admin.hpp
index 5a3feee61..f9532c59f 100644
--- a/wsd/Admin.hpp
+++ b/wsd/Admin.hpp
@@ -124,6 +124,8 @@ public:
     void setDefDocProcSettings(const DocProcSettings& docProcSettings, bool notifyKit)
     {
         _defDocProcSettings = docProcSettings;
+        _model.setDefDocProcSettings(docProcSettings);
+        _cleanupIntervalMs = _defDocProcSettings.getCleanupSettings().getCleanupInterval();
         if (notifyKit)
             notifyForkit();
     }
@@ -149,6 +151,7 @@ private:
     /// under @hardModeLimit
     void triggerMemoryCleanup(size_t hardModeLimit);
     void notifyDocsMemDirtyChanged();
+    void cleanupResourceConsumingDocs();
 
     /// Round the interval up to multiples of MinStatsIntervalMs.
     /// This is to avoid arbitrarily small intervals that hammer the server.
@@ -190,6 +193,7 @@ private:
     int _cpuStatsTaskIntervalMs;
     int _memStatsTaskIntervalMs;
     int _netStatsTaskIntervalMs;
+    size_t _cleanupIntervalMs;
     DocProcSettings _defDocProcSettings;
 
     // Don't update any more frequently than this since it's excessive.
diff --git a/wsd/AdminModel.cpp b/wsd/AdminModel.cpp
index 387796452..893c7739a 100644
--- a/wsd/AdminModel.cpp
+++ b/wsd/AdminModel.cpp
@@ -140,7 +140,7 @@ void Document::updateMemoryDirty()
     const time_t now = std::time(nullptr);
     if (now - _lastTimeSMapsRead >= 5)
     {
-        int lastMemDirty = _memoryDirty;
+        size_t lastMemDirty = _memoryDirty;
         _memoryDirty = _procSMaps  ? Util::getPssAndDirtyFromSMaps(_procSMaps).second : 0;
         _lastTimeSMapsRead = now;
         if (lastMemDirty != _memoryDirty)
@@ -148,6 +148,16 @@ void Document::updateMemoryDirty()
     }
 }
 
+void Document::setLastJiffies(size_t newJ)
+{
+    auto now = std::chrono::system_clock::now();
+    if (_lastJiffy)
+        _lastCpuPercentage = (100 * 1000 * (newJ - _lastJiffy) / ::sysconf(_SC_CLK_TCK))
+                            / std::chrono::duration_cast<std::chrono::milliseconds>(now - _lastJiffyTime).count();
+    _lastJiffy = newJ;
+    _lastJiffyTime = now;
+}
+
 bool Subscriber::notify(const std::string& message)
 {
     // If there is no socket, then return false to
@@ -692,6 +702,57 @@ std::vector<DocBasicInfo> AdminModel::getDocumentsSortedByIdle() const
     return docs;
 }
 
+void AdminModel::cleanupResourceConsumingDocs()
+{
+    DocCleanupSettings& settings = _defDocProcSettings.getCleanupSettings();
+
+    for (const auto& it: _documents)
+    {
+        Document *doc = it.second.get();
+        if (!doc->isExpired())
+        {
+            size_t idleTime = doc->getIdleTime();
+            size_t memDirty = doc->getMemoryDirty();
+            unsigned cpuPercentage = doc->getLastCpuPercentage();
+
+            if (idleTime >= settings.getIdleTime() &&
+                (memDirty >= settings.getLimitDirtyMem() * 1024 ||
+                 cpuPercentage >= settings.getLimitCpu()))
+            {
+                time_t now = std::time(nullptr);
+                const size_t badBehaviorDuration = now - doc->getBadBehaviorDetectionTime();
+                if (!doc->getBadBehaviorDetectionTime())
+                {
+                    LOG_WRN("Detected resource consuming doc [" << doc->getDocKey() << "]: idle="
+                            << idleTime << " s, memory=" << memDirty << " KB, CPU=" << cpuPercentage << "%.");
+                    doc->setBadBehaviorDetectionTime(now);
+                }
+                else if (badBehaviorDuration >= settings.getBadBehaviorPeriod())
+                {
+                    // We should not try to close it nicely (closeDocument) because
+                    // we could lose it: it will be removed from our internal lists
+                    // but the process itself can hang and continue to exist and
+                    // consume resources.
+                    // Also, try first to SIGABRT the kit process so that a stack trace
+                    // could be dumped. If the process is still alive then, at next
+                    // iteration, try to SIGKILL it.
+                    if (SigUtil::killChild(doc->getPid(), doc->getAbortTime() ? SIGKILL : SIGABRT))
+                        LOG_ERR((doc->getAbortTime() ? "Killed" : "Aborted") << " resource consuming doc [" << doc->getDocKey() << "]");
+                    else
+                        LOG_ERR("Cannot " << (doc->getAbortTime() ? "kill" : "abort") << " resource consuming doc [" << doc->getDocKey() << "]");
+                    if (!doc->getAbortTime())
+                        doc->setAbortTime(time_t(nullptr));
+                }
+            }
+            else if (doc->getBadBehaviorDetectionTime())
+            {
+                doc->setBadBehaviorDetectionTime(0);
+                LOG_WRN("Removed doc [" << doc->getDocKey() << "] from resource consuming monitoring list");
+            }
+        }
+    }
+}
+
 std::string AdminModel::getDocuments() const
 {
     assertCorrectThread();
diff --git a/wsd/AdminModel.hpp b/wsd/AdminModel.hpp
index 98bd01495..0a038ad20 100644
--- a/wsd/AdminModel.hpp
+++ b/wsd/AdminModel.hpp
@@ -51,6 +51,30 @@ private:
     std::chrono::milliseconds _loadDuration;
 };
 
+struct DocCleanupSettings
+{
+    void setEnable(bool enable) { _enable = enable; }
+    bool getEnable() const { return _enable; }
+    void setCleanupInterval(size_t cleanupInterval) { _cleanupInterval = cleanupInterval; }
+    size_t getCleanupInterval() const { return _cleanupInterval; }
+    void setBadBehaviorPeriod(size_t badBehaviorPeriod) { _badBehaviorPeriod = badBehaviorPeriod; }
+    size_t getBadBehaviorPeriod() const { return _badBehaviorPeriod; }
+    void setIdleTime(size_t idleTime) { _idleTime = idleTime; }
+    size_t getIdleTime() { return _idleTime; }
+    void setLimitDirtyMem(size_t limitDirtyMem) { _limitDirtyMem = limitDirtyMem; }
+    size_t getLimitDirtyMem() const { return _limitDirtyMem; }
+    void setLimitCpu(size_t limitCpu) { _limitCpu = limitCpu; }
+    size_t getLimitCpu() const { return _limitCpu; }
+
+private:
+    bool _enable;
+    size_t _cleanupInterval;
+    size_t _badBehaviorPeriod;
+    size_t _idleTime;
+    size_t _limitDirtyMem;
+    size_t _limitCpu;
+};
+
 struct DocProcSettings
 {
     void setLimitVirtMemMb(size_t limitVirtMemMb) { _limitVirtMemMb = limitVirtMemMb; }
@@ -62,11 +86,15 @@ struct DocProcSettings
     void setLimitNumberOpenFiles(size_t limitNumberOpenFiles) { _limitNumberOpenFiles = limitNumberOpenFiles; }
     size_t getLimitNumberOpenFiles() const { return _limitNumberOpenFiles; }
 
+    DocCleanupSettings& getCleanupSettings() { return _docCleanupSettings; }
+
 private:
     size_t _limitVirtMemMb;
     size_t _limitStackMemKb;
     size_t _limitFileSizeMb;
     size_t _limitNumberOpenFiles;
+
+    DocCleanupSettings _docCleanupSettings;
 };
 
 /// Containing basic information about document
@@ -112,6 +140,7 @@ public:
           _filename(filename),
           _memoryDirty(0),
           _lastJiffy(0),
+          _lastCpuPercentage(0),
           _start(std::time(nullptr)),
           _lastActivity(_start),
           _end(0),
@@ -122,7 +151,9 @@ public:
           _procSMaps(nullptr),
           _lastTimeSMapsRead(0),
           _isModified(false),
-          _hasMemDirtyChanged(true)
+          _hasMemDirtyChanged(true),
+          _badBehaviorDetectionTime(0),
+          _abortTime(0)
     {
     }
 
@@ -151,13 +182,14 @@ public:
     unsigned getActiveViews() const { return _activeViews; }
 
     unsigned getLastJiffies() const { return _lastJiffy; }
-    void setLastJiffies(size_t newJ) { _lastJiffy = newJ; }
+    void setLastJiffies(size_t newJ);
+    unsigned getLastCpuPercentage(){ return _lastCpuPercentage; }
 
     const std::map<std::string, View>& getViews() const { return _views; }
 
     void updateLastActivityTime() { _lastActivity = std::time(nullptr); }
     void updateMemoryDirty();
-    int getMemoryDirty() const { return _memoryDirty; }
+    size_t getMemoryDirty() const { return _memoryDirty; }
 
     std::pair<std::time_t, std::string> getSnapshot() const;
     const std::string getHistory() const;
@@ -172,9 +204,6 @@ public:
         _recvBytes += recv;
     }
 
-    const DocProcSettings& getDocProcSettings() const { return _docProcSettings; }
-    void setDocProcSettings(const DocProcSettings& docProcSettings) { _docProcSettings = docProcSettings; }
-
     std::time_t getOpenTime() const { return isExpired() ? _end - _start : getElapsedTime(); }
     uint64_t getSentBytes() const { return _sentBytes; }
     uint64_t getRecvBytes() const { return _recvBytes; }
@@ -186,6 +215,10 @@ public:
     void setProcSMapsFD(const int smapsFD) { _procSMaps = fdopen(smapsFD, "r"); }
     bool hasMemDirtyChanged() const { return _hasMemDirtyChanged; }
     void setMemDirtyChanged(bool changeStatus) { _hasMemDirtyChanged = changeStatus; }
+    time_t getBadBehaviorDetectionTime(){ return _badBehaviorDetectionTime; }
+    void setBadBehaviorDetectionTime(time_t badBehaviorDetectionTime){ _badBehaviorDetectionTime = badBehaviorDetectionTime;}
+    time_t getAbortTime(){ return _abortTime; }
+    void setAbortTime(time_t abortTime) { _abortTime = abortTime; }
 
     std::string to_string() const;
 
@@ -199,9 +232,11 @@ private:
     /// Hosted filename
     std::string _filename;
     /// The dirty (ie. un-shared) memory of the document's Kit process.
-    int _memoryDirty;
+    size_t _memoryDirty;
     /// Last noted Jiffy count
     unsigned _lastJiffy;
+    std::chrono::time_point<std::chrono::system_clock> _lastJiffyTime;
+    unsigned _lastCpuPercentage;
 
     std::time_t _start;
     std::time_t _lastActivity;
@@ -218,10 +253,11 @@ private:
     FILE* _procSMaps;
     std::time_t _lastTimeSMapsRead;
 
-    /// Per-doc kit process settings.
-    DocProcSettings _docProcSettings;
     bool _isModified;
     bool _hasMemDirtyChanged;
+
+    std::time_t _badBehaviorDetectionTime;
+    std::time_t _abortTime;
 };
 
 /// An Admin session subscriber.
@@ -332,6 +368,7 @@ public:
 
     /// Document basic info list sorted by most idle time
     std::vector<DocBasicInfo> getDocumentsSortedByIdle() const;
+    void cleanupResourceConsumingDocs();
 
     void setViewLoadDuration(const std::string& docKey, const std::string& sessionId, std::chrono::milliseconds viewLoadDuration);
     void setDocWopiDownloadDuration(const std::string& docKey, std::chrono::milliseconds wopiDownloadDuration);
@@ -345,6 +382,9 @@ public:
     void UpdateMemoryDirty();
     void notifyDocsMemDirtyChanged();
 
+    const DocProcSettings& getDefDocProcSettings() const { return _defDocProcSettings; }
+    void setDefDocProcSettings(const DocProcSettings& docProcSettings) { _defDocProcSettings = docProcSettings; }
+
     static int getPidsFromProcName(const std::regex& procNameRegEx, std::vector<int> *pids);
 
 private:
@@ -391,6 +431,8 @@ private:
 
     /// We check the owner even in the release builds, needs to be always correct.
     std::thread::id _owner;
+
+    DocProcSettings _defDocProcSettings;
 };
 
 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
diff --git a/wsd/LOOLWSD.cpp b/wsd/LOOLWSD.cpp
index 457e109b5..93858ad28 100644
--- a/wsd/LOOLWSD.cpp
+++ b/wsd/LOOLWSD.cpp
@@ -906,6 +906,12 @@ void LOOLWSD::initialize(Application& self)
             { "num_prespawn_children", "1" },
             { "per_document.always_save_on_exit", "false" },
             { "per_document.autosave_duration_secs", "300" },
+            { "per_document.cleanup.cleanup_interval_ms", "10000" },
+            { "per_document.cleanup.bad_behavior_period_secs", "60" },
+            { "per_document.cleanup.idle_time_secs", "300" },
+            { "per_document.cleanup.limit_dirty_mem_mb", "3072" },
+            { "per_document.cleanup.limit_cpu_per", "85" },
+            { "per_document.cleanup[@enable]", "false" },
             { "per_document.document_signing_url", VEREIGN_URL },
             { "per_document.idle_timeout_secs", "3600" },
             { "per_document.idlesave_duration_secs", "30" },
@@ -1292,6 +1298,15 @@ void LOOLWSD::initialize(Application& self)
     docProcSettings.setLimitStackMemKb(getConfigValue<int>("per_document.limit_stack_mem_kb", 0));
     docProcSettings.setLimitFileSizeMb(getConfigValue<int>("per_document.limit_file_size_mb", 0));
     docProcSettings.setLimitNumberOpenFiles(getConfigValue<int>("per_document.limit_num_open_files", 0));
+
+    DocCleanupSettings &docCleanupSettings = docProcSettings.getCleanupSettings();
+    docCleanupSettings.setEnable(getConfigValue<bool>("per_document.cleanup[@enable]", false));
+    docCleanupSettings.setCleanupInterval(getConfigValue<int>("per_document.cleanup.cleanup_interval_ms", 10000));
+    docCleanupSettings.setBadBehaviorPeriod(getConfigValue<int>("per_document.cleanup.bad_behavior_period_secs", 60));
+    docCleanupSettings.setIdleTime(getConfigValue<int>("per_document.cleanup.idle_time_secs", 300));
+    docCleanupSettings.setLimitDirtyMem(getConfigValue<int>("per_document.cleanup.limit_dirty_mem_mb", 3072));
+    docCleanupSettings.setLimitCpu(getConfigValue<int>("per_document.cleanup.limit_cpu_per", 85));
+
     Admin::instance().setDefDocProcSettings(docProcSettings, false);
 
 #if ENABLE_DEBUG


More information about the Libreoffice-commits mailing list