diff --git a/CHANGELOG.md b/CHANGELOG.md index 185cb1e..2ca96b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,8 @@ ## spacesavers2 development version - Move the report to a separate internal repository (#79, @kelly-sovacool) +- new option `--json` for `spacesavers2_pdq` +- `redirect` script now checks if running on BIOWULF or FRCE. If not, then checks for python version and "xxhash" library (fix #91, @kopardev) ### New features diff --git a/bin/redirect b/bin/redirect index 33974ed..c7eda4b 100755 --- a/bin/redirect +++ b/bin/redirect @@ -1,12 +1,6 @@ #!/usr/bin/env bash -# This is a wrapper for spook python script -# It: -# - loads the appropriate conda environment -# - parses the variables directly to the python script -# - deactivate enrivonment - -SCRIPTNAME="$0" +SCRIPTNAME="$BASH_SOURCE" SCRIPTDIRNAME=$(readlink -f $(dirname "$SCRIPTNAME")) # add "bin" to PATH @@ -16,11 +10,47 @@ fi TOOLDIR=$(dirname "$SCRIPTDIRNAME") TOOLNAME=$(basename "$SCRIPTNAME") -# echo $TOOLNAME -# load appropriate conda env -. "/data/CCBR_Pipeliner/db/PipeDB/Conda/etc/profile.d/conda.sh" && conda activate py311 +# find out if you are running on biowulf or frce +nbiowulf=$(scontrol show config 2>/dev/null | grep -i -c biowulf) +if [[ "$nbiowulf" -gt 0 ]];then ISBIOWULF=true; else ISBIOWULF=false;fi +nfrce=$(scontrol show config 2>/dev/null | grep -i -c fsitgl) +if [[ "$nfrce" -gt 0 ]];then ISFRCE=true; else ISFRCE=false;fi +# helix also counts as BIOWULF: only promote the flag here, never reset what scontrol already detected +if [[ "$HOSTNAME" == "helix.nih.gov" ]];then ISBIOWULF=true;fi -${TOOLDIR}/${TOOLNAME} "$@" || true +# load conda; run stays 1 unless a usable python environment is found +run=1 +if [[ $ISBIOWULF == true ]];then + . "/data/CCBR_Pipeliner/db/PipeDB/Conda/etc/profile.d/conda.sh" + conda activate py311 + run=0 +elif [[ $ISFRCE == true ]];then + . 
"/mnt/projects/CCBR-Pipelines/resources/miniconda3/etc/profile.d/conda.sh" + conda activate py311 + run=0 +else + echo "You are NOT running on BIOWULF or on FRCE" + version=$(python --version 2>/dev/null | awk '{print $NF}') + major=$(echo $version|awk -F"." '{print $1}') + minor=$(echo $version|awk -F"." '{print $2}') + if [[ "$major" -eq "3" ]]; then + if [[ "$minor" -ge "11" ]];then + echo "Satisfactory version of Python available: $version" + xxhashfound=$(python -c "import xxhash" 2> /dev/null; echo $?) + if [[ "$xxhashfound" -eq "0" ]];then + echo "xxhash python library present!" + run=0 + fi + fi + fi + if [[ "$run" != "0" ]];then + echo "Please make sure that:" + echo "Python version >= 3.11 with xxhash library is required!" + fi +fi -conda deactivate +if [[ "$run" == "0" ]]; then + "${TOOLDIR}/${TOOLNAME}" "$@" || true + conda deactivate 2>/dev/null +fi diff --git a/docs/pdq.md b/docs/pdq.md index 8f0bd45..c22675b 100644 --- a/docs/pdq.md +++ b/docs/pdq.md @@ -13,12 +13,13 @@ It is quick tool to gather datapoints to monitor filesystem usage. Typically, ca ### Inputs - `--folder`: Path to the folder to run `spacesavers2_pdq` on. - `--threads`: `spacesavers2_pdq` uses multiprocessing library to parallelize orchestration. This defines the number of threads to run in parallel. - - `--outfile`: If not supplied then the optput is written to the screen. + - `--outfile`: If not supplied then the output is written to the screen. + - `--json`: Optional, if provided output is also written in JSON format. > NOTE: `spacesavers2_pdq` reports errors (eg. cannot read file) to STDERR ```bash -usage: spacesavers2_pdq [-h] -f FOLDER [-p THREADS] [-o OUTFILE] [-v] +usage: spacesavers2_pdq [-h] -f FOLDER [-p THREADS] [-o OUTFILE] [-j JSON] [-v] spacesavers2_pdq: get quick per user info (number of files and bytes). @@ -29,11 +30,12 @@ options: -p THREADS, --threads THREADS number of threads to be used (default 4) -o OUTFILE, --outfile OUTFILE - outfile ... catalog file .. 
by default output is printed to screen + outfile ... by default output is printed to screen + -j JSON, --json JSON outfile file in JSON format. -v, --version show program's version number and exit Version: - v0.11.5 + v0.11.6 Example: > spacesavers2_pdq -f /path/to/folder -p 4 -o /path/to/output_file ``` @@ -58,3 +60,24 @@ The 3 items in the line are as follows: | 1 | username | "user1" | | 2 | total no. of files owned | 1386138 | | 3 | total no. of bytes occupied | 6089531321856 | + +## JSON output + +Here is an example output: + +``` +{ + "/data/CCBR_Pipeliner/Tools/spacesavers2": { + "37513": { + "username": "kopardevn", + "nfiles": 1267, + "nbytes": 96084992 + }, + "60731": { + "username": "sovacoolkl", + "nfiles": 895, + "nbytes": 89249280 + } + } +} +``` \ No newline at end of file diff --git a/spacesavers2_pdq b/spacesavers2_pdq index d1885da..93c521e 100755 --- a/spacesavers2_pdq +++ b/spacesavers2_pdq @@ -15,6 +15,7 @@ from src.pdq import pdq from multiprocessing import Pool import argparse from pathlib import Path +import json def task(f): @@ -62,7 +63,15 @@ def main(): dest="outfile", required=False, type=str, - help="outfile ... catalog file .. by default output is printed to screen", + help="outfile ... 
by default output is printed to screen", + ) + parser.add_argument( + "-j", + "--json", + dest="json", + required=False, + type=str, + help="outfile file in JSON format.", ) parser.add_argument("-v", "--version", action="version", version=__version__) @@ -70,7 +79,7 @@ def main(): args = parser.parse_args() folder = args.folder - p = Path(folder) + p = Path(folder).absolute() files = [p] files2 = p.glob("**/*") files.extend(files2) @@ -80,6 +89,9 @@ def main(): else: outfh = sys.stdout + if args.json: + outjson = open(args.json, 'w') + bigdict=dict() with Pool(processes=args.threads) as pool: @@ -89,15 +101,26 @@ def main(): if not uid in bigdict: bigdict[uid]=dict() inode = fd.get_inode() if not inode in bigdict[uid]: bigdict[uid][inode]=fd.get_size() - + + outdict=dict() + outdict[str(p)]=dict() + for uid in bigdict.keys(): username = get_username_groupname(uid) + outdict[str(p)][str(uid)]=dict() nfiles = len(bigdict[uid]) nbytes = 0 for inode in bigdict[uid].keys(): nbytes += bigdict[uid][inode] + outdict[str(p)][str(uid)]['username']=username + outdict[str(p)][str(uid)]['nfiles']=nfiles + outdict[str(p)][str(uid)]['nbytes']=nbytes outfh.write(f"{username}\t{nfiles}\t{nbytes}\n") + if args.json: + json.dump(outdict,outjson,indent=1) + outjson.close() + if args.outfile: outfh.close() diff --git a/src/VERSION b/src/VERSION index 62d5dbd..e5cbde3 100644 --- a/src/VERSION +++ b/src/VERSION @@ -1 +1 @@ -0.11.5 +0.11.6