* @author Arthur Schiwon
 * @author Björn Schießle
 * @author Christoph Wurst
 * @author Daniel Jagszent
 * @author Joas Schilling
 * @author Jörn Friedrich Dreyer
 * @author Lukas Reschke
 * @author Martin Mattel
 * @author Morris Jobke
 * @author Owen Winkler
 * @author Robin Appelman
 * @author Robin McCorkell
 * @author Thomas Müller
 * @author Vincent Petry
 *
 * @license AGPL-3.0
 *
 * This code is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License, version 3,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License, version 3,
 * along with this program. If not, see <http://www.gnu.org/licenses/>
 *
 */
namespace OC\Files\Cache;

use Doctrine\DBAL\Exception;
use OC\Files\Storage\Wrapper\Encryption;
use OC\Files\Storage\Wrapper\Jail;
use OC\Hooks\BasicEmitter;
use OC\SystemConfig;
use OCP\Files\Cache\IScanner;
use OCP\Files\ForbiddenException;
use OCP\Files\NotFoundException;
use OCP\Files\Storage\IReliableEtagStorage;
use OCP\IDBConnection;
use OCP\Lock\ILockingProvider;
use Psr\Log\LoggerInterface;

/**
 * Class Scanner
 *
 * Walks a storage and synchronises what it finds with the file cache.
 *
 * Hooks available in scope \OC\Files\Cache\Scanner:
 *  - scanFile(string $path, string $storageId)
 *  - scanFolder(string $path, string $storageId)
 *  - postScanFile(string $path, string $storageId)
 *  - postScanFolder(string $path, string $storageId)
 *
 * @package OC\Files\Cache
 */
class Scanner extends BasicEmitter implements IScanner {
	/**
	 * @var \OC\Files\Storage\Storage $storage
	 */
	protected $storage;

	/**
	 * @var string $storageId
	 */
	protected $storageId;

	/**
	 * @var \OC\Files\Cache\Cache $cache
	 */
	protected $cache;

	/**
	 * @var boolean $cacheActive If true, perform cache operations, if false, do not affect cache
	 */
	protected $cacheActive;

	/**
	 * @var bool $useTransactions whether to use transactions
	 */
	protected $useTransactions = true;

	/**
	 * @var \OCP\Lock\ILockingProvider
	 */
	protected $lockingProvider;

	protected IDBConnection $connection;

	public function __construct(\OC\Files\Storage\Storage $storage) {
		$this->storage = $storage;
		$this->storageId = $this->storage->getId();
		$this->cache = $storage->getCache();
		/** @var SystemConfig $config */
		$config = \OC::$server->get(SystemConfig::class);
		$this->cacheActive = !$config->getValue('filesystem_cache_readonly', false);
		$this->useTransactions = !$config->getValue('filescanner_no_transactions', false);
		$this->lockingProvider = \OC::$server->get(ILockingProvider::class);
		$this->connection = \OC::$server->get(IDBConnection::class);
	}

	/**
	 * Whether to wrap the scanning of a folder in a database transaction
	 * On default transactions are used
	 *
	 * @param bool $useTransactions
	 */
	public function setUseTransactions($useTransactions) {
		$this->useTransactions = $useTransactions;
	}

	/**
	 * get all the metadata of a file or folder
	 *
	 * A debug message is logged when the storage returns no metadata.
	 *
	 * @param string $path
	 * @return array|null an array of metadata of the file
	 */
	protected function getData($path) {
		$data = $this->storage->getMetaData($path);
		if (is_null($data)) {
			\OC::$server->get(LoggerInterface::class)->debug("!!! Path '$path' is not accessible or present !!!", ['app' => 'core']);
		}
		return $data;
	}

	/**
	 * scan a single file and store it in the cache
	 *
	 * @param string $file
	 * @param int $reuseExisting
	 * @param int $parentId
	 * @param array|null|false $cacheData existing data in the cache for the file to be scanned
	 * @param bool $lock set to false to disable getting an additional read lock during scanning
	 * @param array|null $data the metadata for the file, as returned by the storage
	 * @return array|null an array of metadata of the scanned file
	 * @throws \OCP\Lock\LockedException
	 */
	public function scanFile($file, $reuseExisting = 0, $parentId = -1, $cacheData = null, $lock = true, $data = null) {
		if ($file !== '') {
			try {
				$this->storage->verifyPath(dirname($file), basename($file));
			} catch (\Exception $e) {
				return null;
			}
		}

		// only proceed if $file is not a partial file, blacklist is handled by the storage
		if (self::isPartialFile($file)) {
			return null;
		}

		// acquire a lock
		$useLock = $lock && $this->storage->instanceOfStorage('\OCP\Files\Storage\ILockingStorage');
		if ($useLock) {
			$this->storage->acquireLock($file, ILockingProvider::LOCK_SHARED, $this->lockingProvider);
		}

		// Everything that happens while the lock is held lives inside this try block so that
		// the `finally` below releases the lock on every exit path: normal completion,
		// early returns and any kind of throwable.
		try {
			try {
				$data ??= $this->getData($file);
			} catch (ForbiddenException $e) {
				return null;
			}

			if ($data) {
				// pre-emit only if it was a file. By that we avoid counting/treating folders as files
				if ($data['mimetype'] !== 'httpd/unix-directory') {
					$this->emit('\OC\Files\Cache\Scanner', 'scanFile', [$file, $this->storageId]);
					\OC_Hook::emit('\OC\Files\Cache\Scanner', 'scan_file', ['path' => $file, 'storage' => $this->storageId]);
				}

				$parent = dirname($file);
				if ($parent === '.' || $parent === '/') {
					$parent = '';
				}
				if ($parentId === -1) {
					$parentId = $this->cache->getParentId($file);
				}

				// scan the parent if it's not in the cache (id -1) and the current file is not the root folder
				if ($file && $parentId === -1) {
					$parentData = $this->scanFile($parent);
					if (!$parentData) {
						return null;
					}
					$parentId = $parentData['fileid'];
				}
				if ($parent) {
					$data['parent'] = $parentId;
				}
				if (is_null($cacheData)) {
					/** @var CacheEntry $cacheData */
					$cacheData = $this->cache->get($file);
				}
				if ($cacheData && $reuseExisting && isset($cacheData['fileid'])) {
					// prevent empty etag
					$etag = empty($cacheData['etag']) ? $data['etag'] : $cacheData['etag'];
					$fileId = $cacheData['fileid'];
					$data['fileid'] = $fileId;
					// only reuse data if the file hasn't explicitly changed
					$mtimeUnchanged = isset($data['storage_mtime']) && isset($cacheData['storage_mtime']) && $data['storage_mtime'] === $cacheData['storage_mtime'];
					// if the folder is marked as unscanned, never reuse etags
					if ($mtimeUnchanged && $cacheData['size'] !== -1) {
						$data['mtime'] = $cacheData['mtime'];
						if (($reuseExisting & self::REUSE_SIZE) && ($data['size'] === -1)) {
							$data['size'] = $cacheData['size'];
						}
						if ($reuseExisting & self::REUSE_ETAG && !$this->storage->instanceOfStorage(IReliableEtagStorage::class)) {
							$data['etag'] = $etag;
						}
					}

					// we only updated unencrypted_size if it's already set
					if ($cacheData['unencrypted_size'] === 0) {
						unset($data['unencrypted_size']);
					}

					// Only update metadata that has changed
					// i.e. get all the values in $data that are not present in the cache already
					$newData = $this->array_diff_assoc_multi($data, $cacheData->getData());

					// make it known to the caller that etag has been changed and needs propagation
					if (isset($newData['etag'])) {
						$data['etag_changed'] = true;
					}
				} else {
					// we only updated unencrypted_size if it's already set
					unset($data['unencrypted_size']);
					$newData = $data;
					$fileId = -1;
				}
				if (!empty($newData)) {
					// Reset the checksum if the data has changed
					$newData['checksum'] = '';
					$newData['parent'] = $parentId;
					$data['fileid'] = $this->addToCache($file, $newData, $fileId);
				}

				$data['oldSize'] = ($cacheData && isset($cacheData['size'])) ? $cacheData['size'] : 0;

				if ($cacheData && isset($cacheData['encrypted'])) {
					$data['encrypted'] = $cacheData['encrypted'];
				}

				// post-emit only if it was a file. By that we avoid counting/treating folders as files
				if ($data['mimetype'] !== 'httpd/unix-directory') {
					$this->emit('\OC\Files\Cache\Scanner', 'postScanFile', [$file, $this->storageId]);
					\OC_Hook::emit('\OC\Files\Cache\Scanner', 'post_scan_file', ['path' => $file, 'storage' => $this->storageId]);
				}
			} else {
				$this->removeFromCache($file);
			}
		} finally {
			// release the acquired lock
			if ($useLock) {
				$this->storage->releaseLock($file, ILockingProvider::LOCK_SHARED, $this->lockingProvider);
			}
		}

		if ($data && !isset($data['encrypted'])) {
			$data['encrypted'] = false;
		}
		return $data;
	}

	/**
	 * Announce the removal of $path through the hooks and, if the cache is active,
	 * remove it from the cache
	 *
	 * @param string $path
	 */
	protected function removeFromCache($path) {
		\OC_Hook::emit('Scanner', 'removeFromCache', ['file' => $path]);
		$this->emit('\OC\Files\Cache\Scanner', 'removeFromCache', [$path]);
		if ($this->cacheActive) {
			$this->cache->remove($path);
		}
	}

	/**
	 * Announce new metadata through the hooks and, if the cache is active, write it:
	 * an update when $fileId is known, an insert otherwise
	 *
	 * @param string $path
	 * @param array $data
	 * @param int $fileId
	 * @return int the id of the added file, -1 when the cache is not active
	 */
	protected function addToCache($path, $data, $fileId = -1) {
		if (isset($data['scan_permissions'])) {
			$data['permissions'] = $data['scan_permissions'];
		}
		\OC_Hook::emit('Scanner', 'addToCache', ['file' => $path, 'data' => $data]);
		$this->emit('\OC\Files\Cache\Scanner', 'addToCache', [$path, $this->storageId, $data, $fileId]);

		if (!$this->cacheActive) {
			return -1;
		}
		if ($fileId !== -1) {
			$this->cache->update($fileId, $data);
			return $fileId;
		}
		return $this->cache->insert($path, $data);
	}

	/**
	 * Announce changed metadata through the hooks and, if the cache is active, write it:
	 * an update when $fileId is known, a put by path otherwise
	 *
	 * @param string $path
	 * @param array $data
	 * @param int $fileId
	 */
	protected function updateCache($path, $data, $fileId = -1) {
		\OC_Hook::emit('Scanner', 'addToCache', ['file' => $path, 'data' => $data]);
		$this->emit('\OC\Files\Cache\Scanner', 'updateCache', [$path, $this->storageId, $data]);
		if ($this->cacheActive) {
			if ($fileId !== -1) {
				$this->cache->update($fileId, $data);
			} else {
				$this->cache->put($path, $data);
			}
		}
	}

	/**
	 * scan a folder and all its children
	 *
	 * @param string $path
	 * @param bool $recursive
	 * @param int $reuse
	 * @param bool $lock set to false to disable getting an additional read lock during scanning
	 * @return array|null an array of the meta data of the scanned file or folder
	 */
	public function scan($path, $recursive = self::SCAN_RECURSIVE, $reuse = -1, $lock = true) {
		if ($reuse === -1) {
			$reuse = ($recursive === self::SCAN_SHALLOW) ? self::REUSE_ETAG | self::REUSE_SIZE : self::REUSE_ETAG;
		}

		$useLock = $lock && $this->storage->instanceOfStorage('\OCP\Files\Storage\ILockingStorage');
		if ($useLock) {
			$this->storage->acquireLock('scanner::' . $path, ILockingProvider::LOCK_EXCLUSIVE, $this->lockingProvider);
			try {
				$this->storage->acquireLock($path, ILockingProvider::LOCK_SHARED, $this->lockingProvider);
			} catch (\Throwable $e) {
				// don't keep the scanner lock around when the path itself could not be locked
				$this->storage->releaseLock('scanner::' . $path, ILockingProvider::LOCK_EXCLUSIVE, $this->lockingProvider);
				throw $e;
			}
		}

		try {
			$data = $this->scanFile($path, $reuse, -1, null, $lock);
			if ($data && $data['mimetype'] === 'httpd/unix-directory') {
				$size = $this->scanChildren($path, $recursive, $reuse, $data['fileid'], $lock, $data['size']);
				$data['size'] = $size;
			}
		} catch (NotFoundException $e) {
			$this->removeFromCache($path);
			return null;
		} finally {
			// locks are released in the reverse order of their acquisition
			if ($useLock) {
				$this->storage->releaseLock($path, ILockingProvider::LOCK_SHARED, $this->lockingProvider);
				$this->storage->releaseLock('scanner::' . $path, ILockingProvider::LOCK_EXCLUSIVE, $this->lockingProvider);
			}
		}
		return $data;
	}

	/**
	 * Compares $array1 against $array2 and returns all the values in $array1 that are not in $array2
	 * Note this is a one-way check - i.e. we don't care about things that are in $array2 that aren't in $array1
	 *
	 * Supports multi-dimensional arrays
	 * Also checks keys/indexes
	 * Comparisons are strict just like array_diff_assoc
	 * Order of keys/values does not matter
	 *
	 * An array value in $array1 whose counterpart in $array2 is not an array is reported
	 * as different in its entirety.
	 *
	 * @param array $array1
	 * @param array $array2
	 * @return array with the differences between $array1 and $array2
	 */
	protected function array_diff_assoc_multi(array $array1, array $array2) {
		$result = [];

		foreach ($array1 as $key => $value) {
			// if $array2 doesn't have the same key, that's a result
			if (!array_key_exists($key, $array2)) {
				$result[$key] = $value;
				continue;
			}

			// if $array2's value for the same key is different, that's a result
			if (!is_array($value)) {
				if ($array2[$key] !== $value) {
					$result[$key] = $value;
				}
				continue;
			}

			// an array can never equal a non-array, and recursing would violate
			// the `array` parameter type
			if (!is_array($array2[$key])) {
				$result[$key] = $value;
				continue;
			}

			$nestedDiff = $this->array_diff_assoc_multi($value, $array2[$key]);
			if (!empty($nestedDiff)) {
				$result[$key] = $nestedDiff;
			}
		}
		return $result;
	}

	/**
	 * Get the children currently in the cache
	 *
	 * @param int $folderId
	 * @return array[] the cached children, indexed by their name
	 */
	protected function getExistingChildren($folderId) {
		$existingChildren = [];
		$children = $this->cache->getFolderContentsById($folderId);
		foreach ($children as $child) {
			$existingChildren[$child['name']] = $child;
		}
		return $existingChildren;
	}

	/**
	 * scan all the files and folders in a folder
	 *
	 * @param string $path
	 * @param bool|IScanner::SCAN_RECURSIVE_INCOMPLETE $recursive
	 * @param int $reuse a combination of self::REUSE_*
	 * @param int $folderId id for the folder to be scanned
	 * @param bool $lock set to false to disable getting an additional read lock during scanning
	 * @param int|float $oldSize the size of the folder before (re)scanning the children
	 * @param bool $etagChanged set to a truthy value when the etag of this folder or one of its descendants changed
	 * @return int|float the size of the scanned folder or -1 if the size is unknown at this stage
	 */
	protected function scanChildren(string $path, $recursive, int $reuse, int $folderId, bool $lock, int|float $oldSize, &$etagChanged = false) {
		if ($reuse === -1) {
			$reuse = ($recursive === self::SCAN_SHALLOW) ? self::REUSE_ETAG | self::REUSE_SIZE : self::REUSE_ETAG;
		}
		$this->emit('\OC\Files\Cache\Scanner', 'scanFolder', [$path, $this->storageId]);
		$size = 0;
		$childQueue = $this->handleChildren($path, $recursive, $reuse, $folderId, $lock, $size, $etagChanged);

		foreach ($childQueue as $child => [$childId, $childSize]) {
			// "etag changed" propagates up, but not down, so we pass `false` to the children even if we already know that the etag of the current folder changed
			$childEtagChanged = false;
			$childSize = $this->scanChildren($child, $recursive, $reuse, $childId, $lock, $childSize, $childEtagChanged);
			// boolean OR instead of `|=` so the flag stays a bool
			$etagChanged = $etagChanged || $childEtagChanged;

			if ($childSize === -1) {
				$size = -1;
			} elseif ($size !== -1) {
				$size += $childSize;
			}
		}

		// for encrypted storages, we trigger a regular folder size calculation instead of using the calculated size
		// to make sure we also updated the unencrypted-size where applicable
		if ($this->storage->instanceOfStorage(Encryption::class)) {
			$this->cache->calculateFolderSize($path);
		} elseif ($this->cacheActive) {
			$updatedData = [];
			if ($oldSize !== $size) {
				$updatedData['size'] = $size;
			}
			if ($etagChanged) {
				$updatedData['etag'] = uniqid();
			}
			if ($updatedData) {
				$this->cache->update($folderId, $updatedData);
			}
		}
		$this->emit('\OC\Files\Cache\Scanner', 'postScanFolder', [$path, $this->storageId]);
		return $size;
	}

	/**
	 * Scan the direct children of a folder and remove cached children that no longer exist
	 *
	 * $size and $etagChanged are updated in place; sub folders that still have to be
	 * scanned are returned instead of being scanned here.
	 *
	 * @param bool|IScanner::SCAN_RECURSIVE_INCOMPLETE $recursive
	 * @return array child path => [file id, size] for every sub folder to recurse into
	 */
	private function handleChildren(string $path, $recursive, int $reuse, int $folderId, bool $lock, int|float &$size, bool &$etagChanged): array {
		// we put this in its own function so it cleans up the memory before we start recursing
		$existingChildren = $this->getExistingChildren($folderId);
		$newChildren = iterator_to_array($this->storage->getDirectoryContent($path));

		if (count($existingChildren) === 0 && count($newChildren) === 0) {
			// no need to do a transaction
			return [];
		}

		if ($this->useTransactions) {
			$this->connection->beginTransaction();
		}

		$exceptionOccurred = false;
		$childQueue = [];
		$newChildNames = [];
		foreach ($newChildren as $fileMeta) {
			$permissions = $fileMeta['scan_permissions'] ?? $fileMeta['permissions'];
			if ($permissions === 0) {
				continue;
			}
			$originalFile = $fileMeta['name'];
			$file = trim(\OC\Files\Filesystem::normalizePath($originalFile), '/');
			if (trim($originalFile, '/') !== $file) {
				// encoding mismatch, might require compatibility wrapper
				\OC::$server->get(LoggerInterface::class)->debug('Scanner: Skipping non-normalized file name "'. $originalFile . '" in path "' . $path . '".', ['app' => 'core']);
				$this->emit('\OC\Files\Cache\Scanner', 'normalizedNameMismatch', [$path ? $path . '/' . $originalFile : $originalFile]);
				// skip this entry
				continue;
			}

			$newChildNames[] = $file;
			$child = $path ? $path . '/' . $file : $file;
			try {
				$existingData = $existingChildren[$file] ?? false;
				$data = $this->scanFile($child, $reuse, $folderId, $existingData, $lock, $fileMeta);
				if ($data) {
					if ($data['mimetype'] === 'httpd/unix-directory' && $recursive === self::SCAN_RECURSIVE) {
						$childQueue[$child] = [$data['fileid'], $data['size']];
					} elseif ($data['mimetype'] === 'httpd/unix-directory' && $recursive === self::SCAN_RECURSIVE_INCOMPLETE && $data['size'] === -1) {
						// only recurse into folders which aren't fully scanned
						$childQueue[$child] = [$data['fileid'], $data['size']];
					} elseif ($data['size'] === -1) {
						$size = -1;
					} elseif ($size !== -1) {
						$size += $data['size'];
					}

					if (isset($data['etag_changed']) && $data['etag_changed']) {
						$etagChanged = true;
					}
				}
			} catch (Exception $ex) {
				// might happen if inserting duplicate while a scanning
				// process is running in parallel
				// log and ignore
				if ($this->useTransactions) {
					$this->connection->rollback();
					$this->connection->beginTransaction();
				}
				\OC::$server->get(LoggerInterface::class)->debug('Exception while scanning file "' . $child . '"', [
					'app' => 'core',
					'exception' => $ex,
				]);
				$exceptionOccurred = true;
			} catch (\OCP\Lock\LockedException $e) {
				if ($this->useTransactions) {
					$this->connection->rollback();
				}
				throw $e;
			}
		}

		$removedChildren = \array_diff(array_keys($existingChildren), $newChildNames);
		foreach ($removedChildren as $childName) {
			$child = $path ? $path . '/' . $childName : $childName;
			$this->removeFromCache($child);
		}

		if ($this->useTransactions) {
			$this->connection->commit();
		}
		if ($exceptionOccurred) {
			// It might happen that the parallel scan process has already
			// inserted mimetypes but those weren't available yet inside the transaction
			// To make sure to have the updated mime types in such cases,
			// we reload them here
			\OC::$server->getMimeTypeLoader()->reset();
		}
		return $childQueue;
	}

	/**
	 * check if the file should be ignored when scanning
	 * NOTE: files with a '.part' extension are ignored as well!
	 *       prevents unfinished put requests to be scanned
	 *
	 * Paths containing a folder whose name ends in '.part' are ignored too.
	 *
	 * @param string $file
	 * @return boolean
	 */
	public static function isPartialFile($file) {
		return pathinfo($file, PATHINFO_EXTENSION) === 'part' || str_contains($file, '.part/');
	}

	/**
	 * walk over any folders that are not fully scanned yet and scan them
	 */
	public function backgroundScan() {
		if ($this->storage->instanceOfStorage(Jail::class)) {
			// for jail storage wrappers (shares, groupfolders) we run the background scan on the source storage
			// this is mainly done because the jail wrapper doesn't implement `getIncomplete` (because it would be inefficient).
			//
			// Running the scan on the source storage might scan more than "needed", but the unscanned files outside the jail will
			// have to be scanned at some point anyway.
			$unJailedScanner = $this->storage->getUnjailedStorage()->getScanner();
			$unJailedScanner->backgroundScan();
			return;
		}

		if (!$this->cache->inCache('')) {
			// if the storage isn't in the cache yet, just scan the root completely
			$this->runBackgroundScanJob(function () {
				$this->scan('', self::SCAN_RECURSIVE, self::REUSE_ETAG);
			}, '');
			return;
		}

		$lastPath = null;
		// find any path marked as unscanned and run the scanner until no more paths are unscanned (or we get stuck)
		while (($path = $this->cache->getIncomplete()) !== false && $path !== $lastPath) {
			$this->runBackgroundScanJob(function () use ($path) {
				$this->scan($path, self::SCAN_RECURSIVE_INCOMPLETE, self::REUSE_ETAG | self::REUSE_SIZE);
			}, $path);
			// FIXME: this won't proceed with the next item, needs revamping of getIncomplete()
			// to make this possible
			$lastPath = $path;
		}
	}

	/**
	 * Run a scan callback and correct the folder size of $path afterwards
	 *
	 * Invalid, unavailable, forbidden and locked storages are skipped silently.
	 *
	 * @param callable $callback
	 * @param string $path
	 */
	protected function runBackgroundScanJob(callable $callback, $path) {
		try {
			$callback();
			\OC_Hook::emit('Scanner', 'correctFolderSize', ['path' => $path]);
			if ($this->cacheActive && $this->cache instanceof Cache) {
				$this->cache->correctFolderSize($path, null, true);
			}
		} catch (\OCP\Files\StorageInvalidException | \OCP\Files\StorageNotAvailableException | \OCP\Files\ForbiddenException | \OCP\Lock\LockedException $e) {
			// skip unavailable, forbidden or locked storages
		}
	}

	/**
	 * Set whether the cache is affected by scan operations
	 *
	 * @param boolean $active The active state of the cache
	 */
	public function setCacheActive($active) {
		$this->cacheActive = $active;
	}
}