From d6a45e5c9f868515c2ac515466f3cdc232aaa6d7 Mon Sep 17 00:00:00 2001 From: Andreas Gohr Date: Wed, 13 Mar 2024 12:09:34 +0100 Subject: [PATCH] ignore typical temporary files when listing directories Operating systems love to clutter the file system with all kinds of cruft. This adds a gitignore-like config to skip those files when listing files. --- Crawler.php | 37 +++++++++++++ _test/filelistdata/~$ignoreme.docx | 0 conf/ignore.txt | 85 ++++++++++++++++++++++++++++++ 3 files changed, 122 insertions(+) create mode 100644 _test/filelistdata/~$ignoreme.docx create mode 100644 conf/ignore.txt diff --git a/Crawler.php b/Crawler.php index e8d4ccb..09e5806 100644 --- a/Crawler.php +++ b/Crawler.php @@ -13,6 +13,9 @@ class Crawler /** @var bool */ protected $sortreverse = false; + /** @var string[] patterns to ignore */ + protected $ignore = []; + /** * Initializes the crawler * @@ -24,6 +27,8 @@ public function __construct($extensions) $this->ext = array_map('trim', $this->ext); $this->ext = array_map('preg_quote_cb', $this->ext); $this->ext = implode('|', $this->ext); + + $this->ignore = $this->loadIgnores(); } public function setSortBy($sortby) @@ -67,6 +72,9 @@ public function crawl($root, $local, $pattern, $recursive, $titlefile) if (!is_dir($filepath) && !$this->isExtensionAllowed($file)) { continue; } + if ($this->isFileIgnored($file)) { + continue; + } // get title file $filename = $file; @@ -141,6 +149,35 @@ protected function isExtensionAllowed($file) return preg_match('/(' . $this->ext . ')$/i', $file); } + /** + * Check if a file is ignored by the ignore patterns + * + * @param string $file + * @return bool + */ + protected function isFileIgnored($file) + { + foreach ($this->ignore as $pattern) { + if ($this->fnmatch($pattern, $file)) return true; + } + return false; + } + + /** + * Load the ignore patterns from the ignore.txt file + * + * @return string[] + */ + protected function loadIgnores() + { + $file = __DIR__ . 
'/conf/ignore.txt'; + $ignore = file($file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES) ?: []; + $ignore = array_map(function ($line) { + return trim(preg_replace('/\s*#.*$/', '', $line)); + }, $ignore); + $ignore = array_filter($ignore); + return $ignore; + } /** * Replacement for fnmatch() for windows systems. diff --git a/_test/filelistdata/~$ignoreme.docx b/_test/filelistdata/~$ignoreme.docx new file mode 100644 index 0000000..e69de29 diff --git a/conf/ignore.txt b/conf/ignore.txt new file mode 100644 index 0000000..869b332 --- /dev/null +++ b/conf/ignore.txt @@ -0,0 +1,85 @@ +# This is a gitignore style file to ignore typical temporary files and directories + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### macOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +# iCloud generated files +*.icloud + +### MicrosoftOffice ### +*.tmp + +# Word temporary +~$*.doc* + +# Word Auto Backup File +Backup of *.doc* + +# Excel temporary +~$*.xls* + +# Excel Backup File +*.xlk + +# PowerPoint temporary +~$*.ppt* + +# Visio autosave temporary files +*.~vsd* + +### Windows ### +# Windows thumbnail cache files +Thumbs.db +Thumbs.db:encryptable +ehthumbs.db +ehthumbs_vista.db + +# Dump file +*.stackdump + +# Folder config file +[Dd]esktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN + +# Windows shortcuts +*.lnk