Bugzilla – Bug 10979
clamd: Keep scanning while reloading database
Last modified: 2020-10-21 20:03:10 EDT
Patch sent by Julius Plenz. Also reference clamlist discussion started April 7, 2014.

Hi,

* Julius Plenz <plenz@cis.fu-berlin.de> [2014-04-07 20:43]:
> > 1) Start reload_db() in a background thread, resume scanning, and call
> > back once the new engine is in place; then exchange the pointers
> > from old to new engine and free the old one.
>
> FWIW, I have implemented this option, and it seems to work just
> fine.

We have been running ClamAV on all incoming mail for the past week while
periodically reloading a fresh definition database, and it works very well.

Any chance of getting this included into upstream? Current patch against
Git tag clamav-0.98.1 pasted below.

Cheers,
Julius

--
 clamd/server-th.c | 145 ++++++++++++++++++++++++++++++++----------------------
 clamd/server.h    |   7 +++
 2 files changed, 94 insertions(+), 58 deletions(-)

diff --git a/clamd/server-th.c b/clamd/server-th.c
index 86a3a48..3df5c9c 100644
--- a/clamd/server-th.c
+++ b/clamd/server-th.c
@@ -64,6 +64,8 @@
 int progexit = 0;
 pthread_mutex_t exit_mutex = PTHREAD_MUTEX_INITIALIZER;
 int reload = 0;
+volatile int reload_in_progress = 0;
+struct cl_engine *reload_engine = NULL;
 time_t reloaded_time = 0;
 pthread_mutex_t reload_mutex = PTHREAD_MUTEX_INITIALIZER;
 int sighup = 0;
@@ -158,40 +160,27 @@ void sighandler_th(int sig)
         logg("$Failed to write to syncpipe\n");
 }
 
-static struct cl_engine *reload_db(struct cl_engine *engine, unsigned int dboptions, const struct optstruct *opts, int do_check, int *ret)
+static void *reload_db(void *argp)
 {
-    const char *dbdir;
-    int retval;
-    unsigned int sigs = 0;
-    struct cl_settings *settings = NULL;
-
-    *ret = 0;
-    if(do_check) {
-        if(!dbstat.entries) {
-            logg("No stats for Database check - forcing reload\n");
-            return engine;
-        }
-
-        if(cl_statchkdir(&dbstat) == 1) {
-            logg("SelfCheck: Database modification detected. Forcing reload.\n");
-            return engine;
-        } else {
-            logg("SelfCheck: Database status OK.\n");
-            return NULL;
-        }
-    }
-
-    /* release old structure */
-    if(engine) {
+    const char *dbdir;
+    int retval;
+    unsigned int sigs = 0;
+    struct cl_settings *settings = NULL;
+    struct cl_engine *engine = NULL;
+
+    struct reload_db_args *args = (struct reload_db_args *)argp;
+    struct cl_engine *oldengine = args->engine;
+    unsigned int dboptions = args->dboptions;
+    const struct optstruct *opts = args->opts;
+
+    if(oldengine) {
         /* copy current settings */
-        settings = cl_engine_settings_copy(engine);
+        settings = cl_engine_settings_copy(oldengine);
         if(!settings)
             logg("^Can't make a copy of the current engine settings\n");
-
-        thrmgr_setactiveengine(NULL);
-        cl_engine_free(engine);
     }
+
     dbdir = optget(opts, "DatabaseDirectory")->strarg;
     logg("Reading databases from %s\n", dbdir);
 
@@ -201,18 +190,12 @@ static struct cl_engine *reload_db(struct cl_engine *engine, unsigned int dbopti
     memset(&dbstat, 0, sizeof(struct cl_stat));
     if((retval = cl_statinidir(dbdir, &dbstat))) {
         logg("!cl_statinidir() failed: %s\n", cl_strerror(retval));
-        *ret = 1;
-        if(settings)
-            cl_engine_settings_free(settings);
-        return NULL;
+        goto err;
     }
 
     if(!(engine = cl_engine_new())) {
         logg("!Can't initialize antivirus engine\n");
-        *ret = 1;
-        if(settings)
-            cl_engine_settings_free(settings);
-        return NULL;
+        goto err;
     }
 
     if(settings) {
@@ -222,25 +205,38 @@ static struct cl_engine *reload_db(struct cl_engine *engine, unsigned int dbopti
             logg("^Using default engine settings\n");
         }
         cl_engine_settings_free(settings);
+        settings = NULL;
     }
 
     if((retval = cl_load(dbdir, engine, &sigs, dboptions))) {
         logg("!reload db failed: %s\n", cl_strerror(retval));
-        cl_engine_free(engine);
-        *ret = 1;
-        return NULL;
+        goto err;
     }
 
     if((retval = cl_engine_compile(engine)) != 0) {
         logg("!Database initialization error: can't compile engine: %s\n", cl_strerror(retval));
-        cl_engine_free(engine);
-        *ret = 1;
-        return NULL;
+        goto err;
     }
 
     logg("Database correctly reloaded (%u signatures)\n", sigs);
 
-    thrmgr_setactiveengine(engine);
-    return engine;
+    pthread_mutex_lock(&reload_mutex);
+    time(&reloaded_time);
+    reload_engine = engine;
+    goto end;
+
+err:
+    if(settings)
+        cl_engine_settings_free(settings);
+    if(engine)
+        cl_engine_free(engine);
+
+    pthread_mutex_lock(&reload_mutex);
+
+end:
+    reload_in_progress = 0;
+    pthread_mutex_unlock(&reload_mutex);
+
+    free(args);
 }
 
 /*
@@ -713,6 +709,7 @@ int recvloop_th(int *socketds, unsigned nsockets, struct cl_engine *engine, unsi
     unsigned long long val;
     size_t i, j, rr_last = 0;
     pthread_t accept_th;
+    pthread_t reload_th;
     pthread_mutex_t fds_mutex = PTHREAD_MUTEX_INITIALIZER;
     pthread_mutex_t recvfds_mutex = PTHREAD_MUTEX_INITIALIZER;
     struct acceptdata acceptdata = ACCEPTDATA_INIT(&fds_mutex, &recvfds_mutex);
@@ -1347,10 +1344,19 @@ int recvloop_th(int *socketds, unsigned nsockets, struct cl_engine *engine, unsi
         if(selfchk) {
             time(&current_time);
             if((current_time - start_time) >= (time_t)selfchk) {
-                if(reload_db(engine, dboptions, opts, TRUE, &ret)) {
+                if(!dbstat.entries) {
+                    logg("No stats for Database check - forcing reload\n");
+                    pthread_mutex_lock(&reload_mutex);
+                    reload = 1;
+                    pthread_mutex_unlock(&reload_mutex);
+                }
+                else if(cl_statchkdir(&dbstat) == 1) {
+                    logg("SelfCheck: Database modification detected. Forcing reload.\n");
                     pthread_mutex_lock(&reload_mutex);
                     reload = 1;
                     pthread_mutex_unlock(&reload_mutex);
+                } else {
+                    logg("SelfCheck: Database status OK.\n");
                 }
                 time(&start_time);
             }
@@ -1358,34 +1364,57 @@ int recvloop_th(int *socketds, unsigned nsockets, struct cl_engine *engine, unsi
         /* DB reload */
         pthread_mutex_lock(&reload_mutex);
-        if(reload) {
+        if(reload && reload_in_progress) {
+            logg("Database reload already in progress; ignoring reload request.\n");
+            reload = 0;
+            pthread_mutex_unlock(&reload_mutex);
+        } else if(reload) {
             pthread_mutex_unlock(&reload_mutex);
+            struct reload_db_args *reload_db_args;
+            reload_db_args = (struct reload_db_args *) malloc(sizeof(struct reload_db_args));
+            if (reload_db_args)
+            {
+                reload_db_args->engine = engine;
+                reload_db_args->dboptions = dboptions;
+                reload_db_args->opts = opts;
+
+                pthread_mutex_lock(&reload_mutex);
+                reload = 0;
+                reload_in_progress = 1;
+
+                if (pthread_create(&reload_th, NULL, reload_db, reload_db_args) != 0) {
+                    logg("*Error dispatching DB reload thread!\n");
+                    reload_in_progress = 0;
+                    free(reload_db_args);
+                }
+                pthread_mutex_unlock(&reload_mutex);
+            }
+        } else if(reload_engine != NULL) {
 #if defined(FANOTIFY) || defined(CLAMAUTH)
+            pthread_mutex_unlock(&reload_mutex);
             if(optget(opts, "ScanOnAccess")->enabled && tharg) {
                 logg("Restarting on-access scan\n");
                 pthread_kill(fan_pid, SIGUSR1);
                 pthread_join(fan_pid, NULL);
             }
-#endif
-            engine = reload_db(engine, dboptions, opts, FALSE, &ret);
-            if(ret) {
-                logg("Terminating because of a fatal error.\n");
-                if(new_sd >= 0)
-                    closesocket(new_sd);
-                break;
-            }
-            pthread_mutex_lock(&reload_mutex);
-            reload = 0;
-            time(&reloaded_time);
+#endif
+            static struct cl_engine *tmp;
+            tmp = engine;
+            engine = reload_engine;
+            thrmgr_setactiveengine(engine);
+            time(&start_time);
+            reload_engine = NULL;
             pthread_mutex_unlock(&reload_mutex);
+
+            cl_engine_free(tmp);
+
 #if defined(FANOTIFY) || defined(CLAMAUTH)
             if(optget(opts, "ScanOnAccess")->enabled && tharg) {
                 tharg->engine = engine;
                 pthread_create(&fan_pid, &fan_attr, fan_th, tharg);
             }
 #endif
-            time(&start_time);
         } else {
             pthread_mutex_unlock(&reload_mutex);
The original patch has a bug: the reload threads never get cleaned up. We modified the patch so that the threads are started with the PTHREAD_CREATE_DETACHED attribute. I will attach the current patch against 0.99.2. We have been using the patch in production since 2014 and have never experienced any problems. Looking forward to getting this included upstream.
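For reference, a minimal sketch of what that change amounts to (my own illustration, not the attached patch itself; reload_db and args stand in for the patch's worker function and argument struct). A detached thread's resources are reclaimed automatically when it exits, so no pthread_join() is needed and nothing is leaked per reload:

#include <pthread.h>

int spawn_detached_reload(void *(*reload_db)(void *), void *args)
{
    pthread_t tid;
    pthread_attr_t attr;
    int rc;

    pthread_attr_init(&attr);
    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
    rc = pthread_create(&tid, &attr, reload_db, args);
    pthread_attr_destroy(&attr);

    return rc;   /* 0 on success; joining a detached thread is not allowed */
}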
Created attachment 7196 [details]
Patch against 0.99.2
I had time to test this patch. The idea is good, but there are some issues which need addressing. The unit test reload_test fails with this patch applied, which is a first sign of problems. Another sign is that on a low-volume mail system, where it is normal to have a week without any problems from clamd (no timeouts, no scan failures), this caused 10 email checks to fail in a single day. And when clamd failed, clamscan failed as well. So while the idea is good, the patch is not ready for prime time yet. Scanning must not fail because of a DB reload.
We're running ClamAV version 0.99.3 on multiple hosts and are having similar issues using clamd's INSTREAM functionality during reloads of the definition database. Our tests show that clamd is unavailable for ~10 seconds during a DB reload. Is this an issue that will be prioritized in an upcoming version of ClamAV?
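A simple way to make that unavailability window visible is to probe clamd continuously while triggering a reload. The following is a rough sketch of such a probe (my own illustration, not anything from this bug's attachments), assuming clamd listens on the stock TCPSocket 3310 on localhost; it sends the NULL-terminated PING command once per second and prints how long each reply takes:

#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
    for (;;) {
        struct sockaddr_in sa = { .sin_family = AF_INET, .sin_port = htons(3310) };
        inet_pton(AF_INET, "127.0.0.1", &sa.sin_addr);

        struct timespec t0, t1;
        clock_gettime(CLOCK_MONOTONIC, &t0);

        int fd = socket(AF_INET, SOCK_STREAM, 0);
        char buf[16] = { 0 };
        if (fd >= 0 && connect(fd, (struct sockaddr *)&sa, sizeof(sa)) == 0) {
            send(fd, "zPING", 6, 0);               /* "z" prefix: NULL-terminated command */
            recv(fd, buf, sizeof(buf) - 1, 0);     /* expect "PONG" */
        }
        if (fd >= 0)
            close(fd);

        clock_gettime(CLOCK_MONOTONIC, &t1);
        double ms = (t1.tv_sec - t0.tv_sec) * 1000.0 +
                    (t1.tv_nsec - t0.tv_nsec) / 1e6;
        printf("%s in %.0f ms\n", buf[0] ? buf : "no reply", ms);
        fflush(stdout);
        sleep(1);
    }
}

With an unpatched clamd, the replies stall for roughly the duration of the database reload; with the concurrent-reload patches discussed here they should keep arriving promptly.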
We are interested in this feature request, but aren't committing to it for v0.101. For reference, v0.101 is targeted for release at the end of this year (2018).
Hi, is this being planned for the next release after v0.101?
We're starting 0.102 planning next week. Over the next month or so we'll be organizing new and existing requirements and ranking them. I will make a note to get back to you when I know whether this will be planned for 0.102.
Hi Micah, Just wanted to check if there has been any update regarding this bug. Thank you.
Hi Sumit, We're aiming to include this feature request in our 0.102 release, though our list of 0.102 tasks is still much larger than we can commit to and not everything will make it before our code freeze goal, which is early April. I'm cautiously optimistic that we'll fit this in. Regards, Micah
Any update on progress for this one?
We are not actively working on this, though it is on our list.
Hi,

I think I stumbled into this issue too when using squidclamav (ICAP service) and ClamAV on various firewall systems with additional ruleset providers, e.g. securiteinfo / sanesecurity. If your hardware isn't powerful enough (CPU), all requests can be delayed for as long as it takes to completely reload the databases. For instance, if it takes 10 minutes to reload all the databases, all HTTP requests get stalled for 10 minutes.

I hope you will find time to fix this issue. Thank you very much.

Best Regards,
Christian Czeczil
Created attachment 7583 [details]
Replacement for .../clamav-0.101.4/clamd/server.h
Created attachment 7584 [details]
Replacement for .../clamav-0.101.4/clamd/server-th.c
Using clamav-0.101.4, I have been running the clamd patch from comment 2, attachment 7196 [details], for several weeks on a not very busy mail server. During this time I have been running both the patched version of clamd and the unpatched version. Using my own milter I have scanned all our mail, a few thousand messages per day, using *both* clamd versions (consecutively) as the mail arrived.

While loading new databases, the patched version scans concurrently as expected. In scanning performance, as far as I can tell the two versions are identical. The only failures have been the usual failures to detect malicious mail or spam. Those failures have also been identical in both versions.

============================

Although it is several years old, the patch is not too difficult to apply by hand if you're used to doing things like that, but it might be daunting to a newcomer. For that reason I have attached two files, to replace those of the same names in .../clamav-0.101.4/clamd/. If anyone would care to test this fix on a busier server, I should be pleased to hear the results.

Rather than make the changes as compact as they could be, I have tried to make the changes, and their intent, as obvious as I can, for those who would like to satisfy themselves that there is nothing malicious in the patch. Please feel free to compare these files with the originals from 0.101.4.
If it's any help: I rebuilt the ClamAV packages (current Debian Buster sources) and exchanged clamav-daemon with the patched version on a firewall system with squidclamav. It seems to work like a charm so far - clamd keeps scanning while reloading the database. The affected firewall is integrated into my monitoring, so I should notice if the patch has any general system-wide negative impact on load or RAM usage in comparison to historic data.

Best Regards,
Christian Czeczil
Hi Christian,

Yes, that is very helpful. Thank you. I expect clamd RAM usage to approximately double during the database reload, then drop back to whatever you normally see. When you have more experience with the patched version, I feel sure that many of us will be pleased to see further comments from you.

73
Ged
Here is something immediate I can give you:

- System: 4GB RAM, https://pcengines.ch/apu2.htm unit, with 16GB mSATA SSD
- Currently running (Debian Buster): Linux version 4.19.0-6-amd64 (debian-kernel@lists.debian.org) (gcc version 8.3.0 (Debian 8.3.0-6)) #1 SMP Debian 4.19.67-2 (2019-08-28)

No reload:
PID USER PRI NI VIRT RES SHR S CPU% MEM% TIME+ Command
20080 clamav 20 0 1912M 1555M 3772 S 0.0 39.6 0:00.20 /usr/sbin/clamd --foreground=true
19223 clamav 20 0 1912M 1555M 3772 S 0.0 39.6 3h09:34 /usr/sbin/clamd --foreground=true

Trigger reload via clamdscan --reload @ Wed 11 Sep 14:19:30 CEST 2019:
Sep 11 14:19:30 clamd[19223]: Wed Sep 11 14:19:30 2019 -> Reading databases from /var/lib/clamav
Sep 11 14:19:30 clamd[19223]: Reading databases from /var/lib/clamav

~5 minutes after reload:
PID USER PRI NI VIRT RES SHR S CPU% MEM% TIME+ Command
19223 clamav 20 0 2342M 1960M 3772 S 99.5 49.9 3h14:10 /usr/sbin/clamd --foreground=true
27629 clamav 20 0 2342M 1960M 3772 R 99.5 49.9 4:36.15 /usr/sbin/clamd --foreground=true
20080 clamav 20 0 2342M 1960M 3772 S 0.0 49.9 0:00.20 /usr/sbin/clamd --foreground=true

~10 minutes after reload:
PID USER PRI NI VIRT RES SHR S CPU% MEM% TIME+ Command
27629 clamav 20 0 2975M 2589M 3772 R 100. 65.9 8:49.50 /usr/sbin/clamd --foreground=true
19223 clamav 20 0 2975M 2589M 3772 S 100. 65.9 3h18:24 /usr/sbin/clamd --foreground=true
20080 clamav 20 0 2975M 2589M 3772 S 0.0 65.9 0:00.20 /usr/sbin/clamd --foreground=true

~12 minutes after reload:
PID USER PRI NI VIRT RES SHR S CPU% MEM% TIME+ Command
19223 clamav 20 0 3480M 3045M 3772 S 100. 77.5 3h21:10 /usr/sbin/clamd --foreground=true
27629 clamav 20 0 3480M 3045M 3772 R 100. 77.5 11:35.54 /usr/sbin/clamd --foreground=true
20080 clamav 20 0 3480M 3045M 3772 S 0.0 77.5 0:00.20 /usr/sbin/clamd --foreground=true

After "Database correctly reloaded" (the Eicar-Test hits are from monitoring checking that the antivirus is working):

Logs:
===
Sep 11 14:19:30 clamd[19223]: Wed Sep 11 14:19:30 2019 -> Reading databases from /var/lib/clamav
Sep 11 14:19:30 clamd[19223]: Reading databases from /var/lib/clamav
Sep 11 14:23:32 clamd[19223]: Wed Sep 11 14:23:32 2019 -> instream(local): Eicar-Test-Signature(44d88612fea8a8f36de82e1278abb02f:68) FOUND
Sep 11 14:23:32 clamd[19223]: instream(local): Eicar-Test-Signature(44d88612fea8a8f36de82e1278abb02f:68) FOUND
Sep 11 14:28:32 clamd[19223]: Wed Sep 11 14:28:32 2019 -> instream(local): Eicar-Test-Signature(44d88612fea8a8f36de82e1278abb02f:68) FOUND
Sep 11 14:28:32 clamd[19223]: instream(local): Eicar-Test-Signature(44d88612fea8a8f36de82e1278abb02f:68) FOUND
Sep 11 14:31:16 clamd[19223]: Wed Sep 11 14:31:16 2019 -> Database correctly reloaded (12898587 signatures)
Sep 11 14:31:16 clamd[19223]: Database correctly reloaded (12898587 signatures)
===

PID USER PRI NI VIRT RES SHR S CPU% MEM% TIME+ Command
19223 clamav 20 0 3480M 3045M 3772 S 0.0 77.5 3h21:17 /usr/sbin/clamd --foreground=true
20080 clamav 20 0 3480M 3045M 3772 S 0.0 77.5 0:00.20 /usr/sbin/clamd --foreground=true

Memory usage drops again:
PID USER PRI NI VIRT RES SHR S CPU% MEM% TIME+ Command
19223 clamav 20 0 1912M 1544M 3836 S 0.0 39.3 3h21:23 /usr/sbin/clamd --foreground=true
27880 clamav 20 0 1912M 1544M 3836 S 0.0 39.3 0:00.05 /usr/sbin/clamd --foreground=true
20080 clamav 20 0 1912M 1544M 3836 S 0.0 39.3 0:00.20 /usr/sbin/clamd --foreground=true

If my monitoring indicates anything useful for this case I will post it in the days/weeks to come.

Best Regards,
Christian Czeczil
Created attachment 7586 [details]
clamd-threaded-reloading patch

Alberto Wu kindly provided us with a patch today that provides multithreaded database reloading. I made some tweaks to the patch, but this is mostly his work.

If anyone here is willing to test it out, I would appreciate your help validating the stability of the patch. In my own testing it seemed to work well on macOS and Windows. I will note that this patch doesn't include any option to reload the traditional way. That capability should probably be added, for users who have limited RAM.

On my Windows machine, I noted that the RAM climbed from about 745MB to about 1465MB during the reload and that upon completion, it dropped back to 745MB.

Albe's notes about the patch:

> What it does is offloading the DB load to a separate thread and only replace the current engine instance afterwards.
>
> That means that while reload is pending:
> - existing scan requests use the old db (this is unchanged)
> - new scan requests are honored instead of blocked and they also use the old db (this is new)
>
> After the reload is complete:
> - existing scan requests use the old db (this is unchanged)
> - new scan requests use the new db (this is unchanged)
>
> The existing engine is refcounted so it'll be eventually freed when no longer in use.
>
> Reload requests while reload is pending are silently ignored (i.e. I never fork more than a single reload thread). I think this is perfectly acceptable as the reload time is a few seconds (10-60) while the db update frequency is several hours. If someone has stupidly short release cycles they can still rely on SelfCheck.
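To illustrate the refcounting idea Albe describes, here is a minimal sketch of my own (not code from the attached patch; all names are hypothetical, and in clamd the payload would be a struct cl_engine * freed with cl_engine_free()): scan threads take a reference to whatever engine is current, the reload thread publishes the new engine, and the old one is freed only when its last user releases it.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct engine_ref {
    void *engine;                       /* stand-in for struct cl_engine *   */
    int refcount;                       /* protected by engine_mtx           */
};

static pthread_mutex_t engine_mtx = PTHREAD_MUTEX_INITIALIZER;
static struct engine_ref *current;     /* the engine new scans should use   */

static void engine_put(struct engine_ref *ref)
{
    pthread_mutex_lock(&engine_mtx);
    int last = (--ref->refcount == 0);
    pthread_mutex_unlock(&engine_mtx);
    if (last) {                         /* last user gone: safe to free      */
        printf("freeing engine %p\n", ref->engine);
        free(ref);
    }
}

static struct engine_ref *engine_get(void)       /* called by scan threads  */
{
    pthread_mutex_lock(&engine_mtx);
    struct engine_ref *ref = current;
    ref->refcount++;
    pthread_mutex_unlock(&engine_mtx);
    return ref;
}

static void engine_publish(struct engine_ref *fresh)  /* reload thread      */
{
    fresh->refcount = 1;                /* the "current" slot holds one ref  */
    pthread_mutex_lock(&engine_mtx);
    struct engine_ref *old = current;
    current = fresh;
    pthread_mutex_unlock(&engine_mtx);
    if (old)
        engine_put(old);                /* drop the ref the slot was holding */
}

int main(void)
{
    engine_publish(calloc(1, sizeof(struct engine_ref)));  /* initial engine   */
    struct engine_ref *in_use = engine_get();              /* a running scan   */
    engine_publish(calloc(1, sizeof(struct engine_ref)));  /* reload completes */
    engine_put(in_use);      /* the scan ends: only now is the old engine freed */
    return 0;
}

This is why an in-flight scan keeps using the old database while new requests immediately pick up the new one.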
I forgot to mention, the above patch applies to the current head of dev/0.102 from https://github.com/Cisco-Talos/clamav-devel
Created attachment 7613 [details]
Allow reload thread time to finish

The threaded reload patch probably requires this patch to allow the reload to finish in the unit tests. I noticed random failures in this test, most likely because of a race condition between signalling a reload and starting a test which expects the reload to have finished.
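For illustration only (this is not the attached patch, and the names are hypothetical), one common way to remove that kind of race is to have the reload thread flag completion under a condition variable and have the test wait on it instead of assuming the reload is already done:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t reload_done_mtx  = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  reload_done_cond = PTHREAD_COND_INITIALIZER;
static bool            reload_done      = false;

/* Called by the reload thread once the new engine has been swapped in. */
void signal_reload_done(void)
{
    pthread_mutex_lock(&reload_done_mtx);
    reload_done = true;
    pthread_cond_broadcast(&reload_done_cond);
    pthread_mutex_unlock(&reload_done_mtx);
}

/* Called by the test before sending requests that expect the new database. */
void wait_for_reload_done(void)
{
    pthread_mutex_lock(&reload_done_mtx);
    while (!reload_done)
        pthread_cond_wait(&reload_done_cond, &reload_done_mtx);
    pthread_mutex_unlock(&reload_done_mtx);
}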
+ echo 'Patch #0 (clamd-threaded-reloading.patch):'
Patch #0 (clamd-threaded-reloading.patch):
+ /usr/bin/cat ~/rpmbuild/SOURCES/clamd-threaded-reloading.patch
+ /usr/bin/patch -p1 -b --suffix .threaded_reloading --fuzz=0
patching file clamd/clamd.c
Reversed (or previously applied) patch detected!  Assume -R? [n]

Does that mean I shouldn't apply the patch to 0.102.2?
(In reply to Reio Remma from comment #22)
> + echo 'Patch #0 (clamd-threaded-reloading.patch):'
> Patch #0 (clamd-threaded-reloading.patch):
> + /usr/bin/cat ~/rpmbuild/SOURCES/clamd-threaded-reloading.patch
> + /usr/bin/patch -p1 -b --suffix .threaded_reloading --fuzz=0
> patching file clamd/clamd.c
> Reversed (or previously applied) patch detected!  Assume -R? [n]
>
> Does that mean I shouldn't apply the patch to 0.102.2?

In other news, 0.102.1 with the patch has worked flawlessly on CentOS 7 for me.
The patch from Ged works flawlessly here, first on clamav-0.101.4 and now on clamav-0.102.2, on FreeBSD (pfSense). Please make this part of the official release!
FYI, clamav-0.102.3 runs fine with the patch as well. Bug.
(In reply to Micah Snyder from comment #19)
> Created attachment 7586 [details]
> clamd-threaded-reloading patch
>
> Alberto Wu kindly provided us with a patch today that provides multithreaded
> database reloading. [...]

https://github.com/Cisco-Talos/clamav-devel/pull/123 was made from this patch. We're running it on our testing server to see how it'll work out.
(In reply to David Heidelberg from comment #26)
>
> https://github.com/Cisco-Talos/clamav-devel/pull/123 made from this patch.
> We running it on our testing server to see how it'll work out.

Thanks David for your work moving this forward.

As noted in comments added to your PR -- an additional option was required to do a blocking reload for users with tighter memory constraints. To that end, I have prepared a new PR which I have cross-posted on the public github mirror: https://github.com/Cisco-Talos/clamav-devel/pull/126/files to include the non-blocking/concurrent/multi-threaded clamd database reload feature along with an option to block on the reload.

In this PR I used the patch provided by Alberto Wu, but I wanted to take the time to thank everyone who has worked on this feature as it's definitely been a community effort. Thanks Julius Plenz, Alexander Sulfrian, Arjen de Korte, David Heidelberg, Ged Haywood, and anyone else who's had a hand in crafting and testing this feature.

The blocking reload option is primarily for users that are limited on RAM and cannot afford to have two clamav engines loaded at the same time.

Please try out both the blocking and non-blocking reloads in this PR before we merge it to make sure I haven't screwed anything up when adding the `ConcurrentDatabaseReload no` option.

Micah
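For anyone testing the blocking path, the clamd.conf fragment would presumably look like the following (the option name is taken from the comment above; as far as I can tell the PR's default is the new concurrent behaviour, so this is only needed to opt out):

    # Revert to the old blocking reload to keep peak RAM usage down
    ConcurrentDatabaseReload no

Leaving the option unset (or set to yes) keeps scanning available during the reload, at the cost of two engines being resident in memory while it runs.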
As of this afternoon I'm running two instances of clamd side-by-side on our clamd server, both compiled from Micah's patched version 0.103. The concurrent reload patch is disabled in one instance and enabled in the other. The two instances use the same database files. Mail messages are passed from our mail server to the clamd instances over TCP sockets (by my own milter - we do not use clamav-milter) if the messages get as far as the EOM callback in the milter. The messages are scanned using the two clamd instances sequentially. If anything unexpected happens, I'll let you know.
(In reply to Ged from comment #28)
> As of this afternoon I'm running two instances of clamd side-by-side on our
> clamd server, both compiled from Micah's patched version 0.103. The
> concurrent reload patch is disabled in one instance and enabled in the
> other. The two instances use the same database files. Mail messages are
> passed from our mail server to the clamd instances over TCP sockets (by my
> own milter - we do not use clamav-milter) if the messages get as far as the
> EOM callback in the milter. The messages are scanned using the two clamd
> instances sequentially.
>
> If anything unexpected happens, I'll let you know.

You're awesome Ged! Thank you!
Just merged this feature into dev/0.103. https://github.com/Cisco-Talos/clamav-devel/compare/cdbc833...34c3441bcd3a
*** Bug 12626 has been marked as a duplicate of this bug. ***