From c511efb52daca87bbc641e58a85a17d259c63f46 Mon Sep 17 00:00:00 2001 From: Florent Yvon Date: Tue, 20 Aug 2024 19:53:40 +0100 Subject: [PATCH] Added legacy pgscatalog_utils scoring file validator to pygscatalog.validate --- pgscatalog.validate/LICENSE | 201 ++++++++ pgscatalog.validate/README.md | 10 + pgscatalog.validate/poetry.lock | 214 +++++++++ pgscatalog.validate/poetry.toml | 3 + pgscatalog.validate/pyproject.toml | 29 ++ .../src/pgscatalog/validate/__init__.py | 0 .../src/pgscatalog/validate/cli/__init__.py | 0 .../validate/cli/validate_scorefile.py | 171 +++++++ .../src/pgscatalog/validate/lib/__init__.py | 0 .../validate/lib/common_constants.py | 44 ++ .../validate/lib/formatted/__init__.py | 0 .../validate/lib/formatted/validator.py | 197 ++++++++ .../lib/harmonized_position/__init__.py | 0 .../lib/harmonized_position/validator.py | 98 ++++ .../src/pgscatalog/validate/lib/helpers.py | 29 ++ .../src/pgscatalog/validate/lib/schemas.py | 157 +++++++ .../validate/lib/validate_scorefile.py | 171 +++++++ .../pgscatalog/validate/lib/validator_base.py | 429 ++++++++++++++++++ 18 files changed, 1753 insertions(+) create mode 100644 pgscatalog.validate/LICENSE create mode 100644 pgscatalog.validate/README.md create mode 100644 pgscatalog.validate/poetry.lock create mode 100644 pgscatalog.validate/poetry.toml create mode 100644 pgscatalog.validate/pyproject.toml create mode 100644 pgscatalog.validate/src/pgscatalog/validate/__init__.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/cli/__init__.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/cli/validate_scorefile.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/lib/__init__.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/lib/common_constants.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/lib/formatted/__init__.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/lib/formatted/validator.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/lib/harmonized_position/__init__.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/lib/harmonized_position/validator.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/lib/helpers.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/lib/schemas.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/lib/validate_scorefile.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/lib/validator_base.py diff --git a/pgscatalog.validate/LICENSE b/pgscatalog.validate/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/pgscatalog.validate/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/pgscatalog.validate/README.md b/pgscatalog.validate/README.md new file mode 100644 index 0000000..32f20b9 --- /dev/null +++ b/pgscatalog.validate/README.md @@ -0,0 +1,10 @@ +# `pgscatalog.validate` + +This Python package contains: + +* CLI applications to check/validate that the scoring files and harmonized scoring files match the PGS Catalog scoring file formats +* library classes and functions for working with scoring file validation + +| Application | Description | Link | +|-----------------------|------------------------|-----------------------| +| `pgscatalog-validate` | Validate scoring files | [README](missing_url) | diff --git a/pgscatalog.validate/poetry.lock b/pgscatalog.validate/poetry.lock new file mode 100644 index 0000000..5db99d6 --- /dev/null +++ b/pgscatalog.validate/poetry.lock @@ -0,0 +1,214 @@ +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. + +[[package]] +name = "numpy" +version = "2.1.0" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = ">=3.10" +files = [ + {file = "numpy-2.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6326ab99b52fafdcdeccf602d6286191a79fe2fda0ae90573c5814cd2b0bc1b8"}, + {file = "numpy-2.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0937e54c09f7a9a68da6889362ddd2ff584c02d015ec92672c099b61555f8911"}, + {file = "numpy-2.1.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:30014b234f07b5fec20f4146f69e13cfb1e33ee9a18a1879a0142fbb00d47673"}, + {file = "numpy-2.1.0-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:899da829b362ade41e1e7eccad2cf274035e1cb36ba73034946fccd4afd8606b"}, + {file = "numpy-2.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08801848a40aea24ce16c2ecde3b756f9ad756586fb2d13210939eb69b023f5b"}, + {file = "numpy-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:398049e237d1aae53d82a416dade04defed1a47f87d18d5bd615b6e7d7e41d1f"}, + {file = "numpy-2.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0abb3916a35d9090088a748636b2c06dc9a6542f99cd476979fb156a18192b84"}, + {file = "numpy-2.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:10e2350aea18d04832319aac0f887d5fcec1b36abd485d14f173e3e900b83e33"}, + {file = "numpy-2.1.0-cp310-cp310-win32.whl", hash = "sha256:f6b26e6c3b98adb648243670fddc8cab6ae17473f9dc58c51574af3e64d61211"}, + {file = "numpy-2.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:f505264735ee074250a9c78247ee8618292091d9d1fcc023290e9ac67e8f1afa"}, + {file = "numpy-2.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:76368c788ccb4f4782cf9c842b316140142b4cbf22ff8db82724e82fe1205dce"}, + {file = "numpy-2.1.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:f8e93a01a35be08d31ae33021e5268f157a2d60ebd643cfc15de6ab8e4722eb1"}, + {file = "numpy-2.1.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:9523f8b46485db6939bd069b28b642fec86c30909cea90ef550373787f79530e"}, + {file = "numpy-2.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54139e0eb219f52f60656d163cbe67c31ede51d13236c950145473504fa208cb"}, + {file = "numpy-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5ebbf9fbdabed208d4ecd2e1dfd2c0741af2f876e7ae522c2537d404ca895c3"}, + {file = "numpy-2.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:378cb4f24c7d93066ee4103204f73ed046eb88f9ad5bb2275bb9fa0f6a02bd36"}, + {file = "numpy-2.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:d8f699a709120b220dfe173f79c73cb2a2cab2c0b88dd59d7b49407d032b8ebd"}, + {file = "numpy-2.1.0-cp311-cp311-win32.whl", hash = "sha256:ffbd6faeb190aaf2b5e9024bac9622d2ee549b7ec89ef3a9373fa35313d44e0e"}, + {file = "numpy-2.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:0af3a5987f59d9c529c022c8c2a64805b339b7ef506509fba7d0556649b9714b"}, + {file = "numpy-2.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fe76d75b345dc045acdbc006adcb197cc680754afd6c259de60d358d60c93736"}, + {file = "numpy-2.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f358ea9e47eb3c2d6eba121ab512dfff38a88db719c38d1e67349af210bc7529"}, + {file = "numpy-2.1.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:dd94ce596bda40a9618324547cfaaf6650b1a24f5390350142499aa4e34e53d1"}, + {file = "numpy-2.1.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:b47c551c6724960479cefd7353656498b86e7232429e3a41ab83be4da1b109e8"}, + {file = "numpy-2.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0756a179afa766ad7cb6f036de622e8a8f16ffdd55aa31f296c870b5679d745"}, + {file = "numpy-2.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:24003ba8ff22ea29a8c306e61d316ac74111cebf942afbf692df65509a05f111"}, + {file = "numpy-2.1.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:b34fa5e3b5d6dc7e0a4243fa0f81367027cb6f4a7215a17852979634b5544ee0"}, + {file = "numpy-2.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c4f982715e65036c34897eb598d64aef15150c447be2cfc6643ec7a11af06574"}, + {file = "numpy-2.1.0-cp312-cp312-win32.whl", hash = "sha256:c4cd94dfefbefec3f8b544f61286584292d740e6e9d4677769bc76b8f41deb02"}, + {file = "numpy-2.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:a0cdef204199278f5c461a0bed6ed2e052998276e6d8ab2963d5b5c39a0500bc"}, + {file = "numpy-2.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8ab81ccd753859ab89e67199b9da62c543850f819993761c1e94a75a814ed667"}, + {file = "numpy-2.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:442596f01913656d579309edcd179a2a2f9977d9a14ff41d042475280fc7f34e"}, + {file = "numpy-2.1.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:848c6b5cad9898e4b9ef251b6f934fa34630371f2e916261070a4eb9092ffd33"}, + {file = "numpy-2.1.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:54c6a63e9d81efe64bfb7bcb0ec64332a87d0b87575f6009c8ba67ea6374770b"}, + {file = "numpy-2.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:652e92fc409e278abdd61e9505649e3938f6d04ce7ef1953f2ec598a50e7c195"}, + {file = "numpy-2.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ab32eb9170bf8ffcbb14f11613f4a0b108d3ffee0832457c5d4808233ba8977"}, + {file = "numpy-2.1.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:8fb49a0ba4d8f41198ae2d52118b050fd34dace4b8f3fb0ee34e23eb4ae775b1"}, + {file = "numpy-2.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:44e44973262dc3ae79e9063a1284a73e09d01b894b534a769732ccd46c28cc62"}, + {file = "numpy-2.1.0-cp313-cp313-win32.whl", hash = "sha256:ab83adc099ec62e044b1fbb3a05499fa1e99f6d53a1dde102b2d85eff66ed324"}, + {file = "numpy-2.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:de844aaa4815b78f6023832590d77da0e3b6805c644c33ce94a1e449f16d6ab5"}, + {file = "numpy-2.1.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:343e3e152bf5a087511cd325e3b7ecfd5b92d369e80e74c12cd87826e263ec06"}, + {file = "numpy-2.1.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:f07fa2f15dabe91259828ce7d71b5ca9e2eb7c8c26baa822c825ce43552f4883"}, + 
{file = "numpy-2.1.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:5474dad8c86ee9ba9bb776f4b99ef2d41b3b8f4e0d199d4f7304728ed34d0300"}, + {file = "numpy-2.1.0-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:1f817c71683fd1bb5cff1529a1d085a57f02ccd2ebc5cd2c566f9a01118e3b7d"}, + {file = "numpy-2.1.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a3336fbfa0d38d3deacd3fe7f3d07e13597f29c13abf4d15c3b6dc2291cbbdd"}, + {file = "numpy-2.1.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a894c51fd8c4e834f00ac742abad73fc485df1062f1b875661a3c1e1fb1c2f6"}, + {file = "numpy-2.1.0-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:9156ca1f79fc4acc226696e95bfcc2b486f165a6a59ebe22b2c1f82ab190384a"}, + {file = "numpy-2.1.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:624884b572dff8ca8f60fab591413f077471de64e376b17d291b19f56504b2bb"}, + {file = "numpy-2.1.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:15ef8b2177eeb7e37dd5ef4016f30b7659c57c2c0b57a779f1d537ff33a72c7b"}, + {file = "numpy-2.1.0-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:e5f0642cdf4636198a4990de7a71b693d824c56a757862230454629cf62e323d"}, + {file = "numpy-2.1.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f15976718c004466406342789f31b6673776360f3b1e3c575f25302d7e789575"}, + {file = "numpy-2.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:6c1de77ded79fef664d5098a66810d4d27ca0224e9051906e634b3f7ead134c2"}, + {file = "numpy-2.1.0.tar.gz", hash = "sha256:7dc90da0081f7e1da49ec4e398ede6a8e9cc4f5ebe5f9e06b443ed889ee9aaa2"}, +] + +[[package]] +name = "packaging" +version = "24.1" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, + {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, +] + +[[package]] +name = "pandas" +version = "2.2.2" +description = "Powerful data structures for data analysis, time series, and statistics" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pandas-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce"}, + {file = "pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7adfc142dac335d8c1e0dcbd37eb8617eac386596eb9e1a1b77791cf2498238"}, + {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4abfe0be0d7221be4f12552995e58723c7422c80a659da13ca382697de830c08"}, + {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8635c16bf3d99040fdf3ca3db669a7250ddf49c55dc4aa8fe0ae0fa8d6dcc1f0"}, + {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:40ae1dffb3967a52203105a077415a86044a2bea011b5f321c6aa64b379a3f51"}, + {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8e5a0b00e1e56a842f922e7fae8ae4077aee4af0acb5ae3622bd4b4c30aedf99"}, + {file = "pandas-2.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:ddf818e4e6c7c6f4f7c8a12709696d193976b591cc7dc50588d3d1a6b5dc8772"}, + {file = "pandas-2.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:696039430f7a562b74fa45f540aca068ea85fa34c244d0deee539cb6d70aa288"}, + {file = "pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:8e90497254aacacbc4ea6ae5e7a8cd75629d6ad2b30025a4a8b09aa4faf55151"}, + {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58b84b91b0b9f4bafac2a0ac55002280c094dfc6402402332c0913a59654ab2b"}, + {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2123dc9ad6a814bcdea0f099885276b31b24f7edf40f6cdbc0912672e22eee"}, + {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:2925720037f06e89af896c70bca73459d7e6a4be96f9de79e2d440bd499fe0db"}, + {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0cace394b6ea70c01ca1595f839cf193df35d1575986e484ad35c4aeae7266c1"}, + {file = "pandas-2.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:873d13d177501a28b2756375d59816c365e42ed8417b41665f346289adc68d24"}, + {file = "pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9dfde2a0ddef507a631dc9dc4af6a9489d5e2e740e226ad426a05cabfbd7c8ef"}, + {file = "pandas-2.2.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e9b79011ff7a0f4b1d6da6a61aa1aa604fb312d6647de5bad20013682d1429ce"}, + {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cb51fe389360f3b5a4d57dbd2848a5f033350336ca3b340d1c53a1fad33bcad"}, + {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eee3a87076c0756de40b05c5e9a6069c035ba43e8dd71c379e68cab2c20f16ad"}, + {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:3e374f59e440d4ab45ca2fffde54b81ac3834cf5ae2cdfa69c90bc03bde04d76"}, + {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:43498c0bdb43d55cb162cdc8c06fac328ccb5d2eabe3cadeb3529ae6f0517c32"}, + {file = "pandas-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:d187d355ecec3629624fccb01d104da7d7f391db0311145817525281e2804d23"}, + {file = "pandas-2.2.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0ca6377b8fca51815f382bd0b697a0814c8bda55115678cbc94c30aacbb6eff2"}, + {file = "pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9057e6aa78a584bc93a13f0a9bf7e753a5e9770a30b4d758b8d5f2a62a9433cd"}, + {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:001910ad31abc7bf06f49dcc903755d2f7f3a9186c0c040b827e522e9cef0863"}, + {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66b479b0bd07204e37583c191535505410daa8df638fd8e75ae1b383851fe921"}, + {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a77e9d1c386196879aa5eb712e77461aaee433e54c68cf253053a73b7e49c33a"}, + {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92fd6b027924a7e178ac202cfbe25e53368db90d56872d20ffae94b96c7acc57"}, + {file = "pandas-2.2.2-cp39-cp39-win_amd64.whl", hash = "sha256:640cef9aa381b60e296db324337a554aeeb883ead99dc8f6c18e81a93942f5f4"}, + {file = "pandas-2.2.2.tar.gz", hash = "sha256:9e79019aba43cb4fda9e4d983f8e88ca0373adbb697ae9c6c43093218de28b54"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.22.4", markers = "python_version < \"3.11\""}, + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, +] +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.7" + +[package.extras] +all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", 
"dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"] +aws = ["s3fs (>=2022.11.0)"] +clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"] +compression = ["zstandard (>=0.19.0)"] +computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"] +consortium-standard = ["dataframe-api-compat (>=0.1.7)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"] +feather = ["pyarrow (>=10.0.1)"] +fss = ["fsspec (>=2022.11.0)"] +gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"] +hdf5 = ["tables (>=3.8.0)"] +html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"] +mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"] +parquet = ["pyarrow (>=10.0.1)"] +performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"] +plot = ["matplotlib (>=3.6.3)"] +postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"] +pyarrow = ["pyarrow (>=10.0.1)"] +spss = ["pyreadstat (>=1.2.0)"] +sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"] +test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.9.2)"] + +[[package]] +name = "pandas-schema" +version = "0.3.6" +description = "A validation library for Pandas data frames using user-friendly schemas" +optional = false +python-versions = "*" +files = [ + {file = "pandas_schema-0.3.6-py3-none-any.whl", hash = "sha256:7497621cdf8c191fca1ef6ded9caa6f2153b220f120a2686d921f80c8031994d"}, + {file = "pandas_schema-0.3.6.tar.gz", hash = "sha256:c6bfc52c4bae9cdd7420fbe8c4b0622b769457827c3fc819928405638caf605f"}, +] + +[package.dependencies] +numpy = "*" +packaging = "*" +pandas = ">=0.19" + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, +] + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "pytz" +version = "2024.1" +description = "World timezone definitions, modern and historical" +optional = false +python-versions = "*" +files = [ + {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"}, + {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, +] + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +optional = 
false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + +[[package]] +name = "tzdata" +version = "2024.1" +description = "Provider of IANA time zone data" +optional = false +python-versions = ">=2" +files = [ + {file = "tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252"}, + {file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"}, +] + +[metadata] +lock-version = "2.0" +python-versions = ">=3.10" +content-hash = "a3e59d2093cbf1625eff71e74d9ffc6bbc19ad74bebc5f4c79fda4ad44aa9de1" diff --git a/pgscatalog.validate/poetry.toml b/pgscatalog.validate/poetry.toml new file mode 100644 index 0000000..384db5f --- /dev/null +++ b/pgscatalog.validate/poetry.toml @@ -0,0 +1,3 @@ +[virtualenvs] +create = true +in-project = true \ No newline at end of file diff --git a/pgscatalog.validate/pyproject.toml b/pgscatalog.validate/pyproject.toml new file mode 100644 index 0000000..76165fe --- /dev/null +++ b/pgscatalog.validate/pyproject.toml @@ -0,0 +1,29 @@ +[tool.poetry] +name = "pgscatalog.validate" +version = "0.1" +description = "Lorem ipsum" +authors = [ + "Benjamin Wingfield ", + "Samuel Lambert ", + "Laurent Gil ", + "Florent Yvon " +] +readme = "README.md" +packages = [ + { include = "pgscatalog", from = "src" }, +] + +[tool.poetry.dependencies] +python = ">=3.10" +pandas = "^2.2.0" +pandas-schema = "^0.3.6" + +[tool.poetry.scripts] +pgscatalog-validate = 'pgscatalog.validate.cli.validate_scorefile:validate_scorefile' + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + +[tool.coverage.run] +source = ['src/pgscatalog/validate'] \ No newline at end of file diff --git a/pgscatalog.validate/src/pgscatalog/validate/__init__.py b/pgscatalog.validate/src/pgscatalog/validate/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pgscatalog.validate/src/pgscatalog/validate/cli/__init__.py b/pgscatalog.validate/src/pgscatalog/validate/cli/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pgscatalog.validate/src/pgscatalog/validate/cli/validate_scorefile.py b/pgscatalog.validate/src/pgscatalog/validate/cli/validate_scorefile.py new file mode 100644 index 0000000..d1153e9 --- /dev/null +++ b/pgscatalog.validate/src/pgscatalog/validate/cli/validate_scorefile.py @@ -0,0 +1,171 @@ +import os, glob, re +import argparse +import logging +import textwrap + +data_sum = {'valid': [], 'invalid': [], 'other': []} + +val_types = ('formatted', 'hm_pos') + +logging.basicConfig(level=logging.INFO, format='(%(levelname)s): %(message)s') + + +def validate_scorefile() -> None: + global data_sum, score_dir + args = _parse_args() + _check_args(args) + + # Check PGS Catalog file name nomenclature + check_filename = False + if args.check_filename: + check_filename = True + else: + print("WARNING: the parameter '--check_filename' is not present in the submitted command line, therefore the validation of the scoring file name(s) won't be performed.") + + validator_type = args.t + files_dir = args.dir + log_dir = args.log_dir + + ## Select validator class ## + if validator_type == 'formatted': + import pgscatalog.validate.lib.formatted.validator as validator_package + elif 
validator_type == 'hm_pos':
+        import pgscatalog.validate.lib.harmonized_position.validator as validator_package
+
+    ## Run validator ##
+    # One file
+    if args.f:
+        _run_validator(args.f,log_dir,score_dir,validator_package,check_filename,validator_type)
+    # Content of the directory
+    elif files_dir:
+        count_files = 0
+        # Browse directory: for each file run validator
+        for filepath in sorted(glob.glob(files_dir+"/*.*")):
+            _run_validator(filepath,log_dir,score_dir,validator_package,check_filename,validator_type)
+            count_files += 1
+
+        # Print summary + results
+        print("\nSummary:")
+        if data_sum['valid']:
+            print(f"- Valid: {len(data_sum['valid'])}/{count_files}")
+        if data_sum['invalid']:
+            print(f"- Invalid: {len(data_sum['invalid'])}/{count_files}")
+        if data_sum['other']:
+            print(f"- Other issues: {len(data_sum['other'])}/{count_files}")
+
+        if data_sum['invalid']:
+            print("Invalid files:")
+            print("\n".join(data_sum['invalid']))
+
+
+def _read_last_line(file: str) -> str:
+    '''
+    Return the last line of the file
+    '''
+    with open(file, "r") as file_handle:
+        return file_handle.readlines()[-1]
+
+
+def _file_validation_state(filename: str, log_file: str) -> None:
+    global data_sum
+    if os.path.exists(log_file):
+        log_result = _read_last_line(log_file)
+        if re.search("File is valid", log_result):
+            print("> valid\n")
+            data_sum['valid'].append(filename)
+        elif re.search("File is invalid", log_result):
+            print("#### invalid! ####\n")
+            data_sum['invalid'].append(filename)
+        else:
+            print("!! validation process had an issue. Please look at the logs.\n")
+            data_sum['other'].append(filename)
+    else:
+        print("!! validation process had an issue: the log file can't be found")
+        data_sum['other'].append(filename)
+
+
+def _check_args(args: argparse.Namespace) -> None:
+    global score_dir
+
+    ## Check parameters ##
+    # Type of validator
+    if args.t not in val_types:
+        print(f"Error: Validator type (option -t) '{args.t}' is not in the list of recognized types: {val_types}.")
+        exit(1)
+    # Logs dir
+    if not os.path.isdir(args.log_dir):
+        print(f"Error: Log dir '{args.log_dir}' can't be found!")
+        exit(1)
+    # File and directory parameters (only one of the '-f' and '--dir' can be used)
+    if args.f and args.dir:
+        print("Error: you can't use both options [-f] - single scoring file and [--dir] - directory of scoring files.
Please use only one of these two options!")
+        exit(1)
+    elif not args.f and not args.dir:
+        print("Error: you need to provide a scoring file [-f] or a directory of scoring files [--dir]!")
+        exit(1)
+    elif args.f and not os.path.isfile(args.f):
+        print(f"Error: Scoring file '{args.f}' can't be found!")
+        exit(1)
+    elif args.dir and not os.path.isdir(args.dir):
+        print(f"Error: the scoring file directory '{args.dir}' can't be found!")
+        exit(1)
+    # Scoring files directory (only to compare with the harmonized files)
+    score_dir = None
+    if args.score_dir:
+        score_dir = args.score_dir
+        if not os.path.isdir(score_dir):
+            print(f"Error: Scoring file directory '{score_dir}' can't be found!")
+            exit(1)
+    elif args.t != 'formatted':
+        print("WARNING: the parameter '--score_dir' is not present in the submitted command line, therefore the comparison of the number of data rows between the formatted scoring file(s) and the harmonized scoring file(s) won't be performed.")
+
+
+def _run_validator(filepath: str, log_dir: str, score_dir: str, validator_package: object, check_filename: bool, validator_type: str) -> None:
+    ''' Run the file validator '''
+    file = os.path.basename(filepath)
+    filename = file.split('.')[0]
+    print(f"# Filename: {file}")
+    log_file = f'{log_dir}/{filename}_log.txt'
+
+    # Run validator
+    validator = validator_package.init_validator(filepath,log_file,score_dir)
+    if check_filename:
+        validator.run_validator()
+    else:
+        validator.run_validator_skip_check_filename()
+
+    # Check log
+    _file_validation_state(file,log_file)
+
+
+def _description_text() -> str:
+    return textwrap.dedent('''\
+    Validate a set of scoring files to match the PGS Catalog scoring file formats.
+    It can validate:
+    - The formatted scoring file format (https://www.pgscatalog.org/downloads/#dl_ftp_scoring)
+    - The harmonized (Position) scoring file format (https://www.pgscatalog.org/downloads/#dl_ftp_scoring_hm_pos)
+    ''')
+
+
+def _epilog_text() -> str:
+    return textwrap.dedent(f'''\
+    You need to specify the type of file format to validate, using the parameter '-t' ({' or '.join(val_types)}).
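+    Example (assuming a formatted scoring file and an existing 'logs' directory):
+      pgscatalog-validate -t formatted -f PGS000001.txt.gz --log_dir logs --check_filename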
+    ''')
+
+
+def _parse_args(args=None) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=_description_text(), epilog=_epilog_text(),
+                                     formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("-t", help=f"Type of validator: {' or '.join(val_types)}", metavar='VALIDATOR_TYPE')
+    parser.add_argument("-f", help='The path to the polygenic scoring file to be validated (no need to use the [--dir] option)', metavar='SCORING_FILE_NAME')
+    parser.add_argument('--dir', help='The name of the directory containing the files that need to be processed (no need to use the [-f] option)')
+    parser.add_argument('--score_dir', help='The name of the directory containing the formatted scoring files to compare with the harmonized scoring files')
+    parser.add_argument('--log_dir', help='The name of the log directory where the log file(s) will be stored', required=True)
+    parser.add_argument('--check_filename', help='Check that the file name matches the PGS Catalog nomenclature', required=False, action='store_true')
+    return parser.parse_args(args)
+
+
+if __name__ == '__main__':
+    validate_scorefile()
diff --git a/pgscatalog.validate/src/pgscatalog/validate/lib/__init__.py b/pgscatalog.validate/src/pgscatalog/validate/lib/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pgscatalog.validate/src/pgscatalog/validate/lib/common_constants.py b/pgscatalog.validate/src/pgscatalog/validate/lib/common_constants.py
new file mode 100644
index 0000000..768752a
--- /dev/null
+++ b/pgscatalog.validate/src/pgscatalog/validate/lib/common_constants.py
@@ -0,0 +1,44 @@
+SNP_DSET = 'rsID'
+CHR_DSET = 'chr_name'
+BP_DSET = 'chr_position'
+EFFECT_DSET = 'effect_allele'
+OTH_DSET = 'other_allele'
+EFFECT_WEIGHT_DSET = 'effect_weight'
+
+# Other columns
+LOCUS_DSET = 'locus_name'
+OR_DSET = 'OR'
+HR_DSET = 'HR'
+BETA_DSET = 'beta'
+FREQ_DSET = 'allelefrequency_effect'
+FLAG_INTERACTION_DSET = 'is_interaction'
+FLAG_RECESSIVE_DSET = 'is_recessive'
+FLAG_HAPLOTYPE_DSET = 'is_haplotype'
+FLAG_DIPLOTYPE_DSET = 'is_diplotype'
+METHOD_DSET = 'imputation_method'
+SNP_DESC_DSET = 'variant_description'
+INCLUSION_DSET = 'inclusion_criteria'
+DOSAGE_0_WEIGHT = 'dosage_0_weight'
+DOSAGE_1_WEIGHT = 'dosage_1_weight'
+DOSAGE_2_WEIGHT = 'dosage_2_weight'
+# hmPOS
+HM_SOURCE_DSET = 'hm_source'
+HM_SNP_DSET = 'hm_rsID'
+HM_CHR_DSET = 'hm_chr'
+HM_BP_DSET = 'hm_pos'
+HM_OTH_DSET = 'hm_inferOtherAllele'
+HM_MATCH_CHR_DSET = 'hm_match_chr'
+HM_MATCH_BP_DSET = 'hm_match_pos'
+# hmFinal
+VARIANT_DSET = 'variant_id'
+HM_CODE_DSET = 'hm_code'
+HM_INFO_DSET = 'hm_info'
+
+
+DSET_TYPES = {SNP_DSET: str, CHR_DSET: str, BP_DSET: int, EFFECT_DSET: str, OTH_DSET: str,
+              EFFECT_WEIGHT_DSET: float, VARIANT_DSET: str, HM_CODE_DSET: int, HM_INFO_DSET: str, LOCUS_DSET: str, OR_DSET: float, HR_DSET: float, BETA_DSET: float, FREQ_DSET: float,
+              FLAG_INTERACTION_DSET: str, FLAG_RECESSIVE_DSET: str, FLAG_HAPLOTYPE_DSET: str, FLAG_DIPLOTYPE_DSET: str,
+              METHOD_DSET: str, SNP_DESC_DSET: str, INCLUSION_DSET: str, DOSAGE_0_WEIGHT: float, DOSAGE_1_WEIGHT: float, DOSAGE_2_WEIGHT: float,
+              HM_SOURCE_DSET:str, HM_SNP_DSET: str, HM_CHR_DSET: str, HM_BP_DSET: int, HM_OTH_DSET: str, HM_MATCH_CHR_DSET: str, HM_MATCH_BP_DSET: int}
+
+TO_DISPLAY_ORDER = [ SNP_DSET, CHR_DSET, BP_DSET, EFFECT_DSET, OTH_DSET, EFFECT_WEIGHT_DSET, LOCUS_DSET, OR_DSET, HR_DSET, HM_CODE_DSET, HM_INFO_DSET, HM_SOURCE_DSET, HM_SNP_DSET, HM_BP_DSET, HM_OTH_DSET, HM_MATCH_CHR_DSET, HM_MATCH_BP_DSET]
\ No newline at end of file
diff --git
a/pgscatalog.validate/src/pgscatalog/validate/lib/formatted/__init__.py b/pgscatalog.validate/src/pgscatalog/validate/lib/formatted/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pgscatalog.validate/src/pgscatalog/validate/lib/formatted/validator.py b/pgscatalog.validate/src/pgscatalog/validate/lib/formatted/validator.py
new file mode 100644
index 0000000..3a3bfc0
--- /dev/null
+++ b/pgscatalog.validate/src/pgscatalog/validate/lib/formatted/validator.py
@@ -0,0 +1,197 @@
+import gzip
+import re
+from pandas_schema import Schema
+from ..schemas import *
+from ..validator_base import *
+
+'''
+PGS Catalog formatted scoring file validator
+- using pandas_schema https://github.com/TMiguelT/PandasSchema
+'''
+
+class ValidatorFormatted(ValidatorBase):
+
+    def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0):
+        super().__init__(file, score_dir, logfile, error_limit)
+        self.score_dir = None  # the formatted validator does not compare against a scoring file directory
+        self.meta_format = FORMATTED_META_GENERIC
+        self.schema_validators = FORMATTED_VALIDATORS
+        self.valid_cols = VALID_COLS_FORMATTED
+        self.valid_type = VALID_TYPE_FORMATTED
+        self.setup_field_validation()
+
+
+    def extract_specific_metadata(self,line):
+        ''' Extract some of the metadata. '''
+        match_variants_number = re.search(r'#variants_number=(\d+)', line)
+        if match_variants_number:
+            self.variants_number = int(match_variants_number.group(1))
+
+
+    def get_and_check_variants_number(self):
+        ''' Verify that the number of variant lines corresponds to the number of variants declared in the headers '''
+        variant_lines = 0
+
+        with gzip.open(self.file, 'rb') as f:
+            line_number = 0
+            for line in f:
+                line_number += 1
+                line = line.decode('utf-8').rstrip()
+                if line.startswith('#'):
+                    match_variants_number = re.search(r'#variants_number=(\d+)', line)
+                    if match_variants_number:
+                        self.variants_number = int(match_variants_number.group(1))
+                else:
+                    variant_lines += 1
+                    if re.search(r'\w+', line): # Line not empty
+                        cols = line.split(self.sep)
+                        has_trailing_spaces = self.check_leading_trailing_spaces(cols,line_number)
+                        if has_trailing_spaces:
+                            self.global_errors += 1
+                    else:
+                        self.logger.error(f'- Line {line_number} is empty')
+                        self.global_errors += 1
+
+        if self.variants_number:
+            variant_lines -= 1 # Remove the header line from the count
+            if self.variants_number != variant_lines:
+                self.logger.error(f'- The number of variant lines in the file ({variant_lines}) and the number of variants declared in the headers ({self.variants_number}) are different')
+                self.global_errors += 1
+        else:
+            self.logger.error("- Can't retrieve the number of variants from the headers")
+            self.global_errors += 1
+
+
+    def detect_duplicated_rows(self,dataframe_chunk):
+        ''' Detect duplicated rows in the scoring file.
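+        Rows are compared on the identifying columns present among rsID, chr_name, chr_position, effect_allele and other_allele; any duplicates are logged and their indexes added to bad_rows.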
'''
+        # Columns of interest to compare the different rows
+        cols_sel = []
+        for col in ['rsID','chr_name','chr_position','effect_allele','other_allele']:
+            if col in self.cols_to_validate:
+                cols_sel.append(col)
+
+        duplicate_status = dataframe_chunk.duplicated(cols_sel)
+        if any(duplicate_status):
+            duplicated_rows = dataframe_chunk[duplicate_status]
+            self.logger.error(f'Duplicated row(s) found: {len(duplicated_rows.index)}\n\t-> {duplicated_rows.to_string(header=False,index=False)}')
+            self.global_errors += 1
+            for index in duplicated_rows.index:
+                self.bad_rows.append(index)
+
+
+    def validate_data(self) -> bool:
+        ''' Validate the file: data format and data content '''
+        self.logger.info("Validating data...")
+        if not self.open_file_and_check_for_squareness():
+            self.logger.error("Please fix the table. Some rows have different numbers of columns to the header")
+            self.logger.info("Rows with different numbers of columns to the header are not validated")
+        # Check the consistency between the declared variants number and the actual number of variants in the file
+        self.get_and_check_variants_number()
+
+        for chunk in self.df_iterator(self.file):
+            dataframe_to_validate = chunk[self.cols_to_read]
+            dataframe_to_validate.columns = self.cols_to_validate # sets the headers to standard format if needed
+
+            # Detect duplicated rows
+            self.detect_duplicated_rows(dataframe_to_validate)
+
+            # validate the snp column if present
+            if SNP_DSET in self.header:
+                sub_schema = FORMATTED_VALIDATORS_SNP
+                if CHR_DSET in self.header and BP_DSET in self.header:
+                    sub_schema = FORMATTED_VALIDATORS_SNP_EMPTY
+                self.validate_schema(sub_schema,dataframe_to_validate)
+
+            if CHR_DSET in self.header and BP_DSET in self.header:
+                self.validate_schema(FORMATTED_VALIDATORS_POS, dataframe_to_validate)
+
+            if OR_DSET in self.header:
+                self.validate_schema(FORMATTED_VALIDATORS_OR,dataframe_to_validate)
+
+            if HR_DSET in self.header:
+                self.validate_schema(FORMATTED_VALIDATORS_HR,dataframe_to_validate)
+
+            self.process_errors()
+            if len(self.bad_rows) >= self.error_limit:
+                break
+        if not self.bad_rows and not self.global_errors:
+            if self.is_file_valid():
+                self.logger.info("File is valid")
+            else:
+                self.logger.info("File is invalid")
+        else:
+            self.logger.info("File is invalid - {} bad rows, limit set to {}".format(len(self.bad_rows), self.error_limit))
+            self.set_file_is_invalid()
+        return self.is_file_valid()
+
+
+    def validate_filename(self) -> bool:
+        ''' Validate the file name structure. '''
+        self.logger.info("Validating file name...")
+        filename = self.file.split('/')[-1].split('.')[0]
+        is_valid_filename = True
+        if not re.match(r'^PGS\d{6}$', filename):
+            self.logger.info("Invalid filename: {}".format(self.file))
+            self.logger.error("Filename: {} should follow the pattern 'PGSXXXXXX.txt.gz', where 'XXXXXX' are the 6 digits of the PGS identifier (e.g. PGS000001)".format(filename))
+            is_valid_filename = False
+            self.set_file_is_invalid()
+
+        return is_valid_filename
+
+
+    def validate_headers(self) -> bool:
+        ''' Validate the list of column names.
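+        The header must include effect_allele together with either rsID or chr_name/chr_position, plus at least one of the effect columns (effect_weight, OR or HR).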
'''
+        self.logger.info("Validating headers...")
+        self.detect_genomebuild_with_rsid()
+        required_is_subset = set(STD_COLS_VAR_FORMATTED).issubset(self.header)
+        if not required_is_subset:
+            # check if everything but snp:
+            required_is_subset = set(CHR_COLS_VAR_FORMATTED).issubset(self.header)
+            if not required_is_subset:
+                required_is_subset = set(SNP_COLS_VAR_FORMATTED).issubset(self.header)
+                if not required_is_subset:
+                    self.logger.error("Required headers: {} are not in the file header: {}".format(STD_COLS_VAR_FORMATTED, self.header))
+
+        # Check if at least one of the effect columns is there
+        has_effect_col = 0
+        for col in STD_COLS_EFFECT_FORMATTED:
+            if set([col]).issubset(self.header):
+                has_effect_col = 1
+                break
+        if not has_effect_col:
+            self.logger.error("Required headers: at least one of the columns '{}' must be in the file header: {}".format(STD_COLS_EFFECT_FORMATTED, self.header))
+            required_is_subset = None
+
+        if not required_is_subset:
+            self.logger.info("Invalid headers...exiting before any further checks")
+            self.set_file_is_invalid()
+
+        return required_is_subset
+
+
+    def detect_genomebuild_with_rsid(self):
+        ''' The column "rsID" should always be in the scoring file when the genome build is not reported (i.e. "NR") '''
+        self.get_genomebuild()
+        if self.genomebuild == 'NR':
+            if SNP_DSET not in self.header:
+                self.logger.error(f"- The combination: Genome Build = '{self.genomebuild}' & the missing column '{SNP_DSET}' in the header is not allowed as we have to manually guess the genome build.")
+                self.global_errors += 1
+
+
+    def get_genomebuild(self):
+        ''' Retrieve the Genome Build from the comments '''
+        with gzip.open(self.file, 'rb') as f_in:
+            for f_line in f_in:
+                line = f_line.decode()
+                # Parse the genome build from the header comments
+                if line.startswith('#genome_build'):
+                    gb = (line.split('='))[1]
+                    self.genomebuild = gb.strip()
+                    return
+
+
+##################################################################
+
+def init_validator(file, logfile, score_dir=None) -> ValidatorFormatted:
+    validator = ValidatorFormatted(file=file, score_dir=score_dir, logfile=logfile)
+    return validator
\ No newline at end of file
diff --git a/pgscatalog.validate/src/pgscatalog/validate/lib/harmonized_position/__init__.py b/pgscatalog.validate/src/pgscatalog/validate/lib/harmonized_position/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pgscatalog.validate/src/pgscatalog/validate/lib/harmonized_position/validator.py b/pgscatalog.validate/src/pgscatalog/validate/lib/harmonized_position/validator.py
new file mode 100644
index 0000000..62d3348
--- /dev/null
+++ b/pgscatalog.validate/src/pgscatalog/validate/lib/harmonized_position/validator.py
@@ -0,0 +1,98 @@
+import re
+from ..schemas import *
+from ..validator_base import *
+
+'''
+PGS Catalog Harmonized file validator
+- using pandas_schema https://github.com/TMiguelT/PandasSchema
+'''
+
+class ValidatorPos(ValidatorBase):
+    ''' Validator for the HmPOS Harmonized file format. '''
+
+    def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0):
+        super().__init__(file, score_dir, logfile, error_limit)
+        self.meta_format = HM_META_POS
+        self.schema_validators = POS_VALIDATORS
+        self.valid_cols = VALID_COLS_POS
+        self.valid_type = VALID_TYPE_POS
+        self.setup_field_validation()
+
+
+    def extract_specific_metadata(self,line):
+        ''' Extract some of the metadata.
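+        Currently only the declared variants_number is read from the header comments.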
'''
+        match_variants_number = re.search(r'#variants_number=(\d+)', line)
+        if match_variants_number:
+            self.variants_number = int(match_variants_number.group(1))
+
+
+    def validate_line_content(self,cols_content,var_line_number):
+        ''' Implement the abstract method from ValidatorBase to check some of the data in each row. '''
+        # Check lines
+        line_dict = dict(zip(self.header, cols_content))
+        line_cols = line_dict.keys()
+        # Check each chromosome data is consistent
+        chr_cols = ['chr_name', 'hm_chr', 'hm_match_chr']
+        if all(col_name in line_cols for col_name in chr_cols):
+            if line_dict['chr_name'] == line_dict['hm_chr'] and line_dict['hm_match_chr'] != 'True':
+                self.logger.error(f"- Variant line {var_line_number} | 'hm_match_chr' should be 'True': same chromosome ('chr_name={line_dict['chr_name']}' vs 'hm_chr={line_dict['hm_chr']}')")
+        # Check each position data is consistent
+        pos_cols = ['chr_position', 'hm_pos', 'hm_match_pos']
+        if all(col_name in line_cols for col_name in pos_cols):
+            if line_dict['chr_position'] == line_dict['hm_pos'] and line_dict['hm_match_pos'] != 'True':
+                self.logger.error(f"- Variant line {var_line_number} | 'hm_match_pos' should be 'True': same position ('chr_position={line_dict['chr_position']}' vs 'hm_pos={line_dict['hm_pos']}')")
+
+
+    def validate_filename(self) -> bool:
+        ''' Validate the file name structure. '''
+        self.logger.info("Validating file name...")
+        pgs_id, build = None, None
+        is_valid_filename = True
+        # hmPOS
+        filename = self.file.split('/')[-1].split('.')[0]
+        filename_parts = filename.split('_hmPOS_')
+        if len(filename_parts) != 2:
+            self.logger.error("Filename: {} should follow the pattern <pgs_id>_hmPOS_<build>.txt.gz [build=GRChXX]".format(filename))
+            self.set_file_is_invalid()
+            is_valid_filename = False
+        else:
+            pgs_id, build = filename_parts
+            self.file_pgs_id = pgs_id
+            self.file_genomebuild = build
+            if not self.check_build_is_legit(build):
+                self.logger.error("Build: {} is not an accepted build value".format(build))
+                self.set_file_is_invalid()
+                is_valid_filename = False
+
+        return is_valid_filename
+
+
+    def validate_headers(self) -> bool:
+        ''' Validate the list of column names.
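+        The harmonization columns (hm_source, hm_chr, hm_pos) are required, together with either the rsID/hm_rsID columns or chr_name.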
'''
+        self.logger.info("Validating headers...")
+        # Check that the required harmonization columns are present
+        required_is_subset = set(STD_COLS_VAR_POS).issubset(self.header)
+        if not required_is_subset:
+            self.logger.error("Required headers: {} are not in the file header: {}".format(STD_COLS_VAR_POS, self.header))
+
+        # Check if it has at least a "SNP" column or a "chromosome" column
+        required_pos = set(SNP_COLS_VAR_POS).issubset(self.header)
+        if not required_pos:
+            # check if everything but snp:
+            required_pos = set(CHR_COLS_VAR_POS).issubset(self.header)
+            if not required_pos:
+                self.logger.error("One of the following required headers is missing: '{}' and/or '{}' are not in the file header: {}".format(SNP_COLS_VAR_POS, CHR_COLS_VAR_POS, self.header))
+                required_is_subset = required_pos
+
+        if not required_is_subset:
+            self.logger.info("Invalid headers...exiting before any further checks")
+            self.set_file_is_invalid()
+
+        return required_is_subset
+
+
+##################################################################
+
+def init_validator(file, logfile, score_dir=None) -> ValidatorPos:
+    validator = ValidatorPos(file=file, score_dir=score_dir, logfile=logfile)
+    return validator
\ No newline at end of file
diff --git a/pgscatalog.validate/src/pgscatalog/validate/lib/helpers.py b/pgscatalog.validate/src/pgscatalog/validate/lib/helpers.py
new file mode 100644
index 0000000..7d786e5
--- /dev/null
+++ b/pgscatalog.validate/src/pgscatalog/validate/lib/helpers.py
@@ -0,0 +1,29 @@
+import math
+import pandas as pd
+from pandas_schema.validation import _SeriesValidation
+
+
+class InInclusiveRangeValidation(_SeriesValidation):
+    """
+    Checks that each element in the series is within a given inclusive numerical range.
+    Doesn't care if the values are not numeric - it will try anyway.
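+    Non-numeric values are coerced with pandas.to_numeric(errors='coerce') and therefore fail the range check.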
+    """
+    def __init__(self, min: float = -math.inf, max: float = math.inf, **kwargs):
+        """
+        :param min: The minimum (inclusive) value to accept
+        :param max: The maximum (inclusive) value to accept
+        """
+        self.min = min
+        self.max = max
+        super().__init__(**kwargs)
+
+    @property
+    def default_message(self):
+        return 'was not in the range [{}, {}]'.format(self.min, self.max)
+
+    def validate(self, series: pd.Series) -> pd.Series:
+        series = pd.to_numeric(series, errors='coerce')
+        return (series >= self.min) & (series <= self.max)
+
+
+
diff --git a/pgscatalog.validate/src/pgscatalog/validate/lib/schemas.py b/pgscatalog.validate/src/pgscatalog/validate/lib/schemas.py
new file mode 100644
index 0000000..29d7e67
--- /dev/null
+++ b/pgscatalog.validate/src/pgscatalog/validate/lib/schemas.py
@@ -0,0 +1,157 @@
+import numpy as np
+from pandas_schema import Column
+from pandas_schema.validation import MatchesPatternValidation, InListValidation, CanConvertValidation, LeadingWhitespaceValidation, TrailingWhitespaceValidation, CustomElementValidation
+from .helpers import InInclusiveRangeValidation
+from .common_constants import *
+
+
+#### Validation types ####
+
+VALID_TYPE_FORMATTED = 'formatted'
+VALID_TYPE_POS = 'hm_pos'
+
+
+#### Columns ####
+
+# Formatted scoring files
+STD_COLS_VAR_FORMATTED = (EFFECT_DSET, CHR_DSET, BP_DSET, SNP_DSET) #OR_DSET, RANGE_L_DSET, RANGE_U_DSET, BETA_DSET, SE_DSET, FREQ_DSET , EFFECT_DSET, OTH_DSET)
+
+SNP_COLS_VAR_FORMATTED = (EFFECT_DSET, CHR_DSET, BP_DSET)
+CHR_COLS_VAR_FORMATTED = (EFFECT_DSET, SNP_DSET)
+
+STD_COLS_EFFECT_FORMATTED = (EFFECT_WEIGHT_DSET,OR_DSET,HR_DSET)
+
+VALID_COLS_FORMATTED = (EFFECT_WEIGHT_DSET, OR_DSET, HR_DSET, BETA_DSET, FREQ_DSET, LOCUS_DSET, EFFECT_DSET, OTH_DSET, CHR_DSET, BP_DSET, SNP_DSET)
+
+# Harmonized scoring files - POS
+STD_COLS_VAR_POS = (HM_SOURCE_DSET, HM_CHR_DSET, HM_BP_DSET)
+
+SNP_COLS_VAR_POS = (SNP_DSET, HM_SNP_DSET)
+CHR_COLS_VAR_POS = (CHR_DSET,)
+
+VALID_COLS_POS = (HM_SOURCE_DSET, HM_SNP_DSET, HM_CHR_DSET, HM_BP_DSET, HM_OTH_DSET, HM_MATCH_CHR_DSET, HM_MATCH_BP_DSET)
+
+# Harmonized scoring files - Final
+STD_COLS_VAR_FINAL = (EFFECT_DSET, EFFECT_WEIGHT_DSET, HM_CODE_DSET, HM_INFO_DSET)
+
+SNP_COLS_VAR_FINAL = (VARIANT_DSET,)
+CHR_COLS_VAR_FINAL = (CHR_DSET, HM_CHR_DSET)
+
+VALID_COLS_FINAL = (SNP_DSET, CHR_DSET, BP_DSET, EFFECT_DSET, OTH_DSET, EFFECT_WEIGHT_DSET, LOCUS_DSET, HM_CODE_DSET, HM_SNP_DSET, HM_CHR_DSET, HM_BP_DSET, HM_OTH_DSET, HM_MATCH_CHR_DSET, HM_MATCH_BP_DSET)
+
+
+#### Global variables ####
+
+VALID_CHROMOSOMES = ['1', '2', '3', '4', '5', '6', '7', '8',
+                     '9', '10', '11', '12', '13', '14', '15', '16',
+                     '17', '18', '19', '20', '21', '22',
+                     'X', 'x', 'Y', 'y', 'XY', 'xy', 'MT', 'Mt', 'mt']
+
+VALID_FILE_EXTENSIONS = [".txt", ".txt.gz"]
+
+# For the harmonized files
+VALID_SOURCES = ['ENSEMBL','Author-reported']
+# VALID_CODES = ['5','4','3','1','0','-1','-4','-5']
+BUILD_LIST = ['GRCh37','GRCh38']
+
+
+error_msg = 'this column cannot be null/empty'
+null_validation = CustomElementValidation(lambda d: d is not np.nan and d != '', error_msg)
+
+
+#### Validators ####
+
+# Generic/shared validators
+GENERIC_VALIDATORS = {
+    CHR_DSET: Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True),
+    BP_DSET: Column(BP_DSET, [CanConvertValidation(DSET_TYPES[BP_DSET]), InInclusiveRangeValidation(1, 999999999)], allow_empty=True),
+    EFFECT_WEIGHT_DSET: Column(EFFECT_WEIGHT_DSET, [CanConvertValidation(DSET_TYPES[EFFECT_WEIGHT_DSET]), null_validation], allow_empty=False),
+    EFFECT_DSET:
Column(EFFECT_DSET, [MatchesPatternValidation(r'^[ACTGN\-]+$')], allow_empty=False), + OTH_DSET: Column(OTH_DSET, [MatchesPatternValidation(r'^[ACTGN\-]+$')], allow_empty=True), + LOCUS_DSET: Column(LOCUS_DSET, [CanConvertValidation(DSET_TYPES[LOCUS_DSET]), LeadingWhitespaceValidation(), TrailingWhitespaceValidation(), null_validation], allow_empty=True) +} + +# Formatted validators +FORMATTED_VALIDATORS = {k:v for k,v in GENERIC_VALIDATORS.items()} +FORMATTED_VALIDATORS[SNP_DSET] = Column(SNP_DSET, [CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^(\.|(rs|HLA\-\w+\*)[0-9]+)$')], allow_empty=True) +FORMATTED_VALIDATORS[OR_DSET] = Column(OR_DSET, [CanConvertValidation(DSET_TYPES[OR_DSET]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[HR_DSET] = Column(HR_DSET, [CanConvertValidation(DSET_TYPES[HR_DSET]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[BETA_DSET] = Column(BETA_DSET, [CanConvertValidation(DSET_TYPES[BETA_DSET]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[FREQ_DSET] = Column(FREQ_DSET, [CanConvertValidation(DSET_TYPES[FREQ_DSET]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[DOSAGE_0_WEIGHT] = Column(DOSAGE_0_WEIGHT, [CanConvertValidation(DSET_TYPES[DOSAGE_0_WEIGHT]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[DOSAGE_1_WEIGHT] = Column(DOSAGE_1_WEIGHT, [CanConvertValidation(DSET_TYPES[DOSAGE_1_WEIGHT]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[DOSAGE_2_WEIGHT] = Column(DOSAGE_2_WEIGHT, [CanConvertValidation(DSET_TYPES[DOSAGE_2_WEIGHT]), null_validation], allow_empty=True) + +FORMATTED_VALIDATORS_SNP = {k:v for k,v in FORMATTED_VALIDATORS.items()} +FORMATTED_VALIDATORS_SNP[SNP_DSET] = Column(SNP_DSET, [CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^(\.|(rs|HLA\-\w+\*)[0-9]+)$')], allow_empty=False) + +FORMATTED_VALIDATORS_SNP_EMPTY = {k:v for k,v in FORMATTED_VALIDATORS.items()} +FORMATTED_VALIDATORS_SNP_EMPTY[SNP_DSET] = Column(SNP_DSET, [CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^(rs[0-9]+|HLA\-\w+\*[0-9]+|nan|\.)$')], allow_empty=False) +FORMATTED_VALIDATORS_SNP_EMPTY[CHR_DSET] = Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=False) +FORMATTED_VALIDATORS_SNP_EMPTY[BP_DSET] = Column(BP_DSET, [CanConvertValidation(DSET_TYPES[BP_DSET]), InInclusiveRangeValidation(1, 999999999)], allow_empty=False) + +FORMATTED_VALIDATORS_POS = {k:v for k,v in FORMATTED_VALIDATORS.items()} +FORMATTED_VALIDATORS_POS[CHR_DSET] = Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=False) +FORMATTED_VALIDATORS_POS[BP_DSET] = Column(BP_DSET, [CanConvertValidation(DSET_TYPES[BP_DSET]), InInclusiveRangeValidation(1, 999999999)], allow_empty=False) + +FORMATTED_VALIDATORS_OR = {k:v for k,v in FORMATTED_VALIDATORS.items()} +FORMATTED_VALIDATORS_OR[OR_DSET] = Column(OR_DSET, [CanConvertValidation(DSET_TYPES[OR_DSET])], allow_empty=False) + +FORMATTED_VALIDATORS_HR = {k:v for k,v in FORMATTED_VALIDATORS.items()} +FORMATTED_VALIDATORS_HR[HR_DSET] = Column(HR_DSET, [CanConvertValidation(DSET_TYPES[HR_DSET])], allow_empty=False) + +# Position validators +POS_VALIDATORS = {} +POS_VALIDATORS[HR_DSET] = Column(HR_DSET, [CanConvertValidation(DSET_TYPES[HR_DSET]), null_validation], allow_empty=True) +POS_VALIDATORS[HM_SOURCE_DSET] = Column(HM_SOURCE_DSET, [CanConvertValidation(DSET_TYPES[HM_SOURCE_DSET]), InListValidation(VALID_SOURCES), LeadingWhitespaceValidation(), TrailingWhitespaceValidation(), null_validation], 
allow_empty=False) +POS_VALIDATORS[HM_SNP_DSET] = Column(HM_SNP_DSET, [CanConvertValidation(DSET_TYPES[HM_SNP_DSET]), MatchesPatternValidation(r'^(rs|HLA\-\w+\*)[0-9]+$')], allow_empty=True) +POS_VALIDATORS[HM_CHR_DSET] = Column(HM_CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True) +POS_VALIDATORS[HM_BP_DSET] = Column(HM_BP_DSET, [CanConvertValidation(DSET_TYPES[HM_BP_DSET]), InInclusiveRangeValidation(1, 999999999)], allow_empty=True) +POS_VALIDATORS[HM_OTH_DSET] = Column(HM_OTH_DSET, [MatchesPatternValidation(r'^[ACTGN\-\/]+$')], allow_empty=True) +POS_VALIDATORS[HM_MATCH_CHR_DSET] = Column(HM_MATCH_CHR_DSET, [InListValidation(['True', 'False'])], allow_empty=True) +POS_VALIDATORS[HM_MATCH_BP_DSET] = Column(HM_MATCH_BP_DSET, [InListValidation(['True', 'False'])], allow_empty=True) + +# Final validator +# FINAL_VALIDATORS = {k:v for k,v in GENERIC_VALIDATORS.items()} +# FINAL_VALIDATORS[EFFECT_DSET] = Column(EFFECT_DSET, [MatchesPatternValidation(r'^[ACTGN\-]+$')], allow_empty=True) +# FINAL_VALIDATORS[OTH_DSET] = Column(OTH_DSET, [MatchesPatternValidation(r'^[ACTGN\-\.]+$')], allow_empty=True) +# FINAL_VALIDATORS[VARIANT_DSET] = Column(VARIANT_DSET, [CanConvertValidation(DSET_TYPES[VARIANT_DSET]), MatchesPatternValidation(r'^((rs|HLA\-\w+\*)[0-9]+|\.)$')], allow_empty=True) +# FINAL_VALIDATORS[HM_CODE_DSET] = Column(HM_CODE_DSET, [InListValidation(VALID_CODES), null_validation], allow_empty=True) +# FINAL_VALIDATORS[HM_INFO_DSET] = Column(HM_INFO_DSET, [CanConvertValidation(DSET_TYPES[HM_INFO_DSET]), null_validation], allow_empty=True) + + +#### Metadata entries #### + +FORMATTED_META_GENERIC = [ + '###PGS CATALOG SCORING FILE', + '#format_version', + '##POLYGENIC SCORE', + '#pgs_id', + '#pgs_name', + '#trait_reported', + '#trait_mapped', + '#trait_efo', + '#genome_build', + '#variants_number', + '#weight_type', + '##SOURCE INFORMATION', + '#pgp_id', + '#citation' +] + +HM_META_GENERIC = [ x for x in FORMATTED_META_GENERIC ] +HM_META_GENERIC.append('##HARMONIZATION DETAILS') + +HM_META_POS = [ x for x in HM_META_GENERIC ] +HM_META_POS.append('#HmPOS_build') +HM_META_POS.append('#HmPOS_date') +HM_META_POS.append('#HmPOS_match_chr') +HM_META_POS.append('#HmPOS_match_pos') + +# HM_META_FINAL = [ x for x in HM_META_GENERIC ] +# HM_META_FINAL.append('#Hm_file_version') +# HM_META_FINAL.append('#Hm_genome_build') +# HM_META_FINAL.append('#Hm_reference_source') +# HM_META_FINAL.append('#Hm_creation_date') +# HM_META_FINAL.append('#Hm_variants_number_matched') +# HM_META_FINAL.append('#Hm_variants_number_unmapped') \ No newline at end of file diff --git a/pgscatalog.validate/src/pgscatalog/validate/lib/validate_scorefile.py b/pgscatalog.validate/src/pgscatalog/validate/lib/validate_scorefile.py new file mode 100644 index 0000000..80294c3 --- /dev/null +++ b/pgscatalog.validate/src/pgscatalog/validate/lib/validate_scorefile.py @@ -0,0 +1,171 @@ +import os, glob, re +import argparse +import logging +import textwrap + +data_sum = {'valid': [], 'invalid': [], 'other': []} + +val_types = ('formatted', 'hm_pos') + +logging.basicConfig(level=logging.INFO, format='(%(levelname)s): %(message)s') + + +def validate_scorefile() -> None: + global data_sum, score_dir + args = _parse_args() + _check_args(args) + + # Check PGS Catalog file name nomenclature + check_filename = False + if args.check_filename: + check_filename = True + else: + print("WARNING: the parameter '--check_filename' is not present in the submitted command line, therefore the validation of the scoring file name(s) won't be 
performed.") + + validator_type = args.t + files_dir = args.dir + log_dir = args.log_dir + + ## Select validator class ## + if validator_type == 'formatted': + import pgscatalog_utils.validate.formatted.validator as validator_package + elif validator_type == 'hm_pos': + import pgscatalog_utils.validate.harmonized_position.validator as validator_package + + ## Run validator ## + # One file + if args.f: + _run_validator(args.f,log_dir,score_dir,validator_package,check_filename,validator_type) + # Content of the directory + elif files_dir: + count_files = 0 + # Browse directory: for each file run validator + for filepath in sorted(glob.glob(files_dir+"/*.*")): + _run_validator(filepath,log_dir,score_dir,validator_package,check_filename,validator_type) + count_files += 1 + + # Print summary + results + print("\nSummary:") + if data_sum['valid']: + print(f"- Valid: {len(data_sum['valid'])}/{count_files}") + if data_sum['invalid']: + print(f"- Invalid: {len(data_sum['invalid'])}/{count_files}") + if data_sum['other']: + print(f"- Other issues: {len(data_sum['other'])}/{count_files}") + + if data_sum['invalid']: + print("Invalid files:") + print("\n".join(data_sum['invalid'])) + + +def _read_last_line(file: str) -> str: + ''' + Return the last line of the file + ''' + fileHandle = open ( file,"r" ) + lineList = fileHandle.readlines() + fileHandle.close() + return lineList[-1] + + +def _file_validation_state(filename: str, log_file: str) -> None: + global data_sum + if os.path.exists(log_file): + log_result = _read_last_line(log_file) + if re.search("File is valid", log_result): + print("> valid\n") + data_sum['valid'].append(filename) + elif re.search("File is invalid", log_result): + print("#### invalid! ####\n") + data_sum['invalid'].append(filename) + else:# + print("!! validation process had an issue. Please look at the logs.\n") + data_sum['other'].append(filename) + else: + print("!! validation process had an issue: the log file can't be found") + data_sum['other'].append(filename) + + +def _check_args(args: argparse.Namespace) -> None: + global score_dir + + ## Check parameters ## + # Type of validator + if args.t not in val_types: + print(f"Error: Validator type (option -t) '{args.t}' is not in the list of recognized types: {val_types}.") + exit(1) + # Logs dir + if not os.path.isdir(args.log_dir): + print(f"Error: Log dir '{args.log_dir}' can't be found!") + exit(1) + # File and directory parameters (only one of the '-f' and '--dir' can be used) + if args.f and args.dir: + print("Error: you can't use both options [-f] - single scoring file and [--dir] - directory of scoring files. 
def _check_args(args: argparse.Namespace) -> None:
+    global score_dir
+
+    ## Check parameters ##
+    # Type of validator
+    if args.t not in val_types:
+        print(f"Error: Validator type (option -t) '{args.t}' is not in the list of recognized types: {val_types}.")
+        exit(1)
+    # Logs dir
+    if not os.path.isdir(args.log_dir):
+        print(f"Error: Log dir '{args.log_dir}' can't be found!")
+        exit(1)
+    # File and directory parameters (only one of '-f' and '--dir' can be used)
+    if args.f and args.dir:
+        print("Error: you can't use both options [-f] - a single scoring file - and [--dir] - a directory of scoring files. Please use only one of these two options!")
+        exit(1)
+    elif not args.f and not args.dir:
+        print("Error: you need to provide a scoring file [-f] or a directory of scoring files [--dir]!")
+        exit(1)
+    elif args.f and not os.path.isfile(args.f):
+        print(f"Error: Scoring file '{args.f}' can't be found!")
+        exit(1)
+    elif args.dir and not os.path.isdir(args.dir):
+        print(f"Error: the scoring file directory '{args.dir}' can't be found!")
+        exit(1)
+    # Scoring files directory (only to compare with the harmonized files)
+    score_dir = None
+    if args.score_dir:
+        score_dir = args.score_dir
+        if not os.path.isdir(score_dir):
+            print(f"Error: Scoring file directory '{score_dir}' can't be found!")
+            exit(1)
+    elif args.t != 'formatted':
+        print("WARNING: the parameter '--score_dir' is not present in the submitted command line, therefore the comparison of the number of data rows between the formatted scoring file(s) and the harmonized scoring file(s) won't be performed.")
+
+
+def _run_validator(filepath: str, log_dir: str, score_dir: str, validator_package: object, check_filename: bool, validator_type: str) -> None:
+    ''' Run the file validator '''
+    file = os.path.basename(filepath)
+    filename = file.split('.')[0]
+    print(f"# Filename: {file}")
+    log_file = f'{log_dir}/{filename}_log.txt'
+
+    # Run validator
+    validator = validator_package.init_validator(filepath,log_file,score_dir)
+    if check_filename:
+        validator.run_validator()
+    else:
+        validator.run_validator_skip_check_filename()
+
+    # Check log
+    _file_validation_state(file,log_file)
+
+
+def _description_text() -> str:
+    return textwrap.dedent('''\
+    Validate a set of scoring files to match the PGS Catalog scoring file formats.
+    It can validate:
+    - The formatted scoring file format (https://www.pgscatalog.org/downloads/#dl_ftp_scoring)
+    - The harmonized (Position) scoring file format (https://www.pgscatalog.org/downloads/#dl_ftp_scoring_hm_pos)
+    ''')
+
+
+def _epilog_text() -> str:
+    return textwrap.dedent(f'''\
+    You need to specify the type of file format to validate, using the parameter '-t' ({' or '.join(val_types)}).
+    ''')
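Assuming the package is installed, the module can be exercised end-to-end as sketched below (all paths are hypothetical; `_parse_args()` falls back to `sys.argv` when called without arguments):

```python
# Hypothetical invocation sketch: drive the CLI programmatically by faking argv.
import sys
from pgscatalog.validate.lib.validate_scorefile import validate_scorefile

sys.argv = ['validate_scorefile', '-t', 'hm_pos',
            '--dir', '/path/to/harmonized_files',      # hypothetical paths
            '--score_dir', '/path/to/formatted_files',
            '--log_dir', '/path/to/logs',
            '--check_filename']
validate_scorefile()
```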
+
+
+def _parse_args(args=None) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=_description_text(), epilog=_epilog_text(),
+                                     formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("-t", help=f"Type of validator: {' or '.join(val_types)}", metavar='VALIDATOR_TYPE')
+    parser.add_argument("-f", help='The path to the polygenic scoring file to be validated (no need to use the [--dir] option)', metavar='SCORING_FILE_NAME')
+    parser.add_argument('--dir', help='The name of the directory containing the files that need to be processed (no need to use the [-f] option)')
+    parser.add_argument('--score_dir', help='The name of the directory containing the formatted scoring files to compare with harmonized scoring files')
+    parser.add_argument('--log_dir', help='The name of the log directory where the log file(s) will be stored', required=True)
+    parser.add_argument('--check_filename', help='Check that the file name matches the PGS Catalog nomenclature', required=False, action='store_true')
+    return parser.parse_args(args)
+
+
+if __name__ == '__main__':
+    validate_scorefile()
diff --git a/pgscatalog.validate/src/pgscatalog/validate/lib/validator_base.py b/pgscatalog.validate/src/pgscatalog/validate/lib/validator_base.py
new file mode 100644
index 0000000..6d9173d
--- /dev/null
+++ b/pgscatalog.validate/src/pgscatalog/validate/lib/validator_base.py
@@ -0,0 +1,429 @@
+import os, sys, gc
+import gzip
+import csv
+import pathlib
+import logging
+import re
+from typing import List
+import pandas as pd
+import pandas_schema
+import warnings
+from .schemas import *
+
+'''
+PGS Catalog file validator
+- using pandas_schema https://github.com/TMiguelT/PandasSchema
+'''
+
+
+csv.field_size_limit(sys.maxsize)
+
+class ValidatorBase:
+
+    valid_extensions = VALID_FILE_EXTENSIONS
+    schema_validators = GENERIC_VALIDATORS
+    valid_cols = []
+    valid_type = ''
+    sep = '\t'
+
+    def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0):
+        self.file = file
+        self.score_dir = score_dir
+        self.schema = None
+        self.header = []
+        self.genomebuild = None
+        self.comment_lines_count = 1 # Counting the header line
+        self.cols_to_validate = []
+        self.cols_to_read = []
+        self.bad_rows = []
+        self.row_errors = []
+        self.errors_seen = {}
+        self.logfile = logfile
+        self.error_limit = int(error_limit)
+        self.is_valid = True
+
+        # Logging variables
+        self.logger = logging.getLogger(__name__)
+        self.handler = logging.FileHandler(self.logfile, 'w+')
+        self.handler.setLevel(logging.INFO)
+        self.logger.addHandler(self.handler)
+        self.logger.propagate = False
+
+        self.global_errors = 0
+        self.variants_number = 0
+
+
+    def validate_schema(self, schema: dict, dataframe_to_validate: pd.core.frame.DataFrame):
+        '''
+        Run the pandas_schema validation using the provided Schema and DataFrame
+        '''
+        self.schema = pandas_schema.Schema([schema[h] for h in self.cols_to_validate])
+        with warnings.catch_warnings():
+            # Ignore Python warnings raised by the pandas_schema code
+            warnings.simplefilter('ignore', UserWarning)
+            errors = self.schema.validate(dataframe_to_validate)
+        self.store_errors(errors)
+
+
+    def setup_field_validation(self):
+        '''
+        Fetch the header and build the lists of columns to read and validate
+        '''
+        self.header = self.get_header()
+        self.cols_to_validate = [h for h in self.header if h in self.valid_cols]
+        self.cols_to_read = [h for h in self.header if h in self.valid_cols]
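To make the moving parts concrete, here is a minimal, self-contained sketch of how a `Column`, the custom `InInclusiveRangeValidation` and `Schema.validate()` interact (the column name `chr_position` and the use of plain `int` instead of `DSET_TYPES` are illustrative only):

```python
# Minimal pandas_schema sketch mirroring validate_schema() above.
import pandas as pd
from pandas_schema import Column, Schema
from pandas_schema.validation import CanConvertValidation

from pgscatalog.validate.lib.helpers import InInclusiveRangeValidation

df = pd.DataFrame({'chr_position': ['12345', '0', 'not_a_number']})
schema = Schema([
    Column('chr_position', [CanConvertValidation(int),
                            InInclusiveRangeValidation(1, 999999999)])
])
for warning in schema.validate(df):
    # ValidationWarning objects expose .row, .column and .message,
    # which store_errors() below relies on.
    print(warning.row, warning.column, warning.message)
```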
+    def get_header(self):
+        '''
+        Fetch the header (i.e. the column names) from the scoring file and store the list in a variable
+        '''
+        first_row = pd.read_csv(self.file, sep=self.sep, comment='#', nrows=1, index_col=False)
+        # Check if the column headers have leading and/or trailing spaces
+        # The leading/trailing spaces should raise an error during the header validation
+        has_trailing_spaces = self.check_leading_trailing_spaces(first_row.columns.values)
+        if has_trailing_spaces:
+            self.global_errors += 1
+        return first_row.columns.values
+
+
+    def get_genomebuild(self):
+        ''' Retrieve the Genome Build from the comments '''
+        if self.valid_type == 'hm_pos':
+            self.genomebuild = self.get_comments_info('#HmPOS_build')
+        else:
+            self.genomebuild = self.get_comments_info('#Hm_genome_build')
+
+
+    def get_pgs_id(self):
+        ''' Retrieve the PGS ID from the comments '''
+        self.pgs_id = self.get_comments_info('#pgs_id')
+
+
+    def validate_content(self):
+        ''' Validate the data lines and check that all the expected metadata lines are present '''
+        variant_lines_count = 0
+        meta_lines_count = 0
+
+        with gzip.open(self.file, 'rb') as f:
+            line_number = 0
+            file_meta = []
+            for line in f:
+                line_number += 1
+                line = line.decode('utf-8').rstrip()
+                # Check metadata
+                if line.startswith('#'):
+                    self.extract_specific_metadata(line)
+                    # Check that we have all the meta information
+                    for meta in self.meta_format:
+                        if line.startswith(meta):
+                            file_meta.append(meta)
+                            meta_lines_count += 1
+                            break
+
+                # Check data
+                else:
+                    variant_lines_count += 1
+                    if re.search(r'\w+', line): # Line not empty
+                        cols_content = line.split(self.sep)
+                        has_trailing_spaces = self.check_leading_trailing_spaces(cols_content,line_number)
+                        if has_trailing_spaces:
+                            self.global_errors += 1
+
+                        if line.startswith('rsID') or line.startswith('chr_name'):
+                            continue
+
+                        self.validate_line_content(cols_content,variant_lines_count)
+                    else:
+                        self.logger.error(f'- Line {line_number} is empty')
+                        self.global_errors += 1
+
+        # Compare the number of metadata lines: read vs expected
+        if meta_lines_count != len(self.meta_format):
+            self.logger.error(f'- The number of metadata lines [i.e. starting with the "#" character] in the file ({meta_lines_count}) and the expected number of metadata lines ({len(self.meta_format)}) are different')
+            diff_list = list(set(self.meta_format).difference(file_meta))
+            self.logger.error(f" > Missing metadata line(s): {', '.join(diff_list)}")
+            self.global_errors += 1
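For context, `validate_content()` expects one metadata line per entry of the relevant `meta_format` list (`FORMATTED_META_GENERIC`, or `HM_META_POS` for harmonized files, from `schemas.py`; the attribute itself is assigned by the concrete validators). A sketch of the comment block it scans, with made-up values after each known prefix, looks like:

```
###PGS CATALOG SCORING FILE
#format_version=2.0
##POLYGENIC SCORE
#pgs_id=PGS000001
#pgs_name=EXAMPLE_PGS
#trait_reported=Example trait
#trait_mapped=example trait
#trait_efo=EFO_0000000
#genome_build=GRCh38
#variants_number=77
#weight_type=beta
##SOURCE INFORMATION
#pgp_id=PGP000001
#citation=Author A et al. Journal (2020)
```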
+    def validate_data(self) -> bool:
+        ''' Validate the file: data format and data content '''
+        self.logger.info("Validating data...")
+        if not self.open_file_and_check_for_squareness():
+            self.logger.error("Please fix the table: some rows have a different number of columns from the header")
+            self.logger.info("Rows with a different number of columns from the header are not validated")
+
+        # Validate the data content and check the consistency between the declared number of variants and the actual number of variants in the file
+        self.validate_content()
+        for chunk in self.df_iterator(self.file):
+            dataframe_to_validate = chunk[self.cols_to_read]
+            dataframe_to_validate.columns = self.cols_to_validate # sets the headers to the standard format if needed
+
+            # Schema validation
+            self.validate_schema(self.schema_validators,dataframe_to_validate)
+
+            self.process_errors()
+            if self.error_limit > 0 and len(self.bad_rows) >= self.error_limit:
+                break
+
+        if not self.bad_rows and not self.global_errors and self.is_valid:
+            self.logger.info("File is valid")
+        else:
+            self.logger.info("File is invalid - {} bad rows, limit set to {}".format(len(self.bad_rows), self.error_limit))
+            self.set_file_is_invalid()
+        return self.is_valid
+
+
+    def is_file_valid(self) -> bool:
+        ''' Method returning the boolean value: True if the file is valid, False if the file is invalid. '''
+        return self.is_valid
+
+    def set_file_is_invalid(self):
+        ''' Set the flag "is_valid" to False. '''
+        self.is_valid = False
+
+
+    def process_errors(self):
+        ''' Log the stored errors and populate the list of bad rows. '''
+        for error in self.row_errors:
+            if len(self.bad_rows) < self.error_limit or self.error_limit < 1:
+                self.logger.error(error)
+            if error.row not in self.bad_rows:
+                self.bad_rows.append(error.row)
+        self.row_errors = []
+
+
+    def store_errors(self, errors: List[pandas_schema.validation_warning.ValidationWarning]):
+        ''' Capture the errors found in a temporary structure before they are processed. '''
+        for error in errors:
+            seen = 0
+            row_number = error.row
+            file_line_number = row_number + self.comment_lines_count + 1 # rows are 0-indexed; +1 for the header line
+            error.row = str(row_number) + " (line "+str(file_line_number)+")"
+            col = error.column
+            # Avoid duplication as the errors can be detected several times
+            if row_number in self.errors_seen.keys():
+                if col in self.errors_seen[row_number].keys():
+                    seen = 1
+                else:
+                    self.errors_seen[row_number][col] = 1
+            else:
+                self.errors_seen[row_number] = { col : 1 }
+            if seen == 0:
+                self.row_errors.append(error)
+
+
+    def validate_file_extension(self):
+        ''' Check/validate the file name extension. '''
+        self.logger.info("Validating file extension...")
+        check_exts = [self.check_ext(ext) for ext in self.valid_extensions]
+        if not any(check_exts):
+            self.valid_ext = False
+            self.set_file_is_invalid()
+            self.logger.info("Invalid file extension: {}".format(self.file))
+            self.logger.error("File extension should be in {}".format(self.valid_extensions))
+        else:
+            self.valid_ext = True
+        return self.valid_ext
+    def compare_number_of_rows(self):
+        ''' Compare the number of data rows between the harmonized and the formatted scoring files. '''
+        # Harmonization file - length
+        hm_rows_count = 0
+        for chunk in self.df_iterator(self.file):
+            hm_rows_count += len(chunk.index)
+            gc.collect()
+
+        # Formatted scoring file - length
+        scoring_rows_count = 0
+        scoring_file = f'{self.score_dir}/{self.pgs_id}.txt.gz'
+        if os.path.isfile(scoring_file):
+            for score_chunk in self.df_iterator(scoring_file):
+                scoring_rows_count += len(score_chunk.index)
+                gc.collect()
+
+        comparison_status = True
+        if scoring_rows_count == 0:
+            self.logger.error(f"Can't find the Scoring file '{scoring_file}' to compare the number of rows with the harmonization file!")
+            comparison_status = False
+        elif hm_rows_count != scoring_rows_count:
+            self.logger.error(f'The numbers of data rows in the Scoring file ({scoring_rows_count}) and the Harmonization POS file ({hm_rows_count}) are different')
+            comparison_status = False
+        return comparison_status
+
+
+    def compare_with_filename(self):
+        ''' Check that the filename matches the information present in the file metadata (PGS ID, genome build). '''
+        self.logger.info("Comparing filename with metadata...")
+        comparison_status = True
+        if hasattr(self,'file_genomebuild') and hasattr(self,'file_pgs_id'):
+            # Extract some metadata
+            self.get_genomebuild()
+            self.get_pgs_id()
+            # Compare metadata with filename information
+            if self.file_genomebuild != self.genomebuild:
+                self.logger.error("Build: the genome build in the HmPOS_build header ({}) is different from the one in the filename ({})".format(self.genomebuild,self.file_genomebuild))
+                comparison_status = False
+            if self.file_pgs_id != self.pgs_id:
+                self.logger.error("ID: the PGS ID in the header ({}) is different from the one in the filename ({})".format(self.pgs_id,self.file_pgs_id))
+                comparison_status = False
+            # Compare number of rows with Scoring file
+            if self.score_dir:
+                row_comparison_status = self.compare_number_of_rows()
+                if not row_comparison_status:
+                    comparison_status = False
+            else:
+                self.logger.info("Comparison of the number of rows between Harmonized and Scoring file skipped!")
+        if not comparison_status:
+            self.logger.info("Discrepancies between filename information and metadata: {}".format(self.file))
+            self.set_file_is_invalid()
+        return comparison_status
+
+
+    def df_iterator(self, data_file: str):
+        ''' Set up a pandas DataFrame iterator. '''
+        df = pd.read_csv(data_file,
+                         sep=self.sep,
+                         dtype=str,
+                         comment='#',
+                         chunksize=1000000)
+        return df
+
+
+    def check_file_is_square(self, csv_file):
+        ''' Check that each row has the same number of columns as the header. '''
+        square = True
+        csv_file.seek(0)
+        reader = csv.reader(csv_file, delimiter=self.sep)
+        count = 1
+        for row in reader:
+            if len(row) != 0:
+                if row[0].startswith('#'):
+                    self.comment_lines_count += 1
+                    continue
+                if (len(row) != len(self.header)):
+                    self.logger.error("Length of row {c} is: {l} instead of {h}".format(c=count, l=str(len(row)), h=str(len(self.header))))
+                    self.logger.error("ROW: "+str(row))
+                    square = False
+            count += 1
+        del csv_file
+        return square
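The same chunked-reading pattern backs both `compare_number_of_rows()` and `validate_data()`: with `chunksize` set, pandas returns an iterator of DataFrames, so a scoring file is never loaded into memory whole. A minimal standalone sketch (the file name is hypothetical):

```python
# Chunked row count, mirroring df_iterator() above.
import pandas as pd

rows = 0
for chunk in pd.read_csv('PGS000001.txt.gz',  # hypothetical file name
                         sep='\t', dtype=str, comment='#', chunksize=1000000):
    rows += len(chunk.index)
print(f'{rows} data rows')
```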
+    def open_file_and_check_for_squareness(self):
+        ''' Read the file and check that each row has the same number of columns as the header. '''
+        if pathlib.Path(self.file).suffix in [".gz", ".gzip"]:
+            with gzip.open(self.file, 'rt') as f:
+                return self.check_file_is_square(f)
+        else:
+            with open(self.file) as f:
+                return self.check_file_is_square(f)
+
+
+    def check_leading_trailing_spaces(self, cols: list, line_number: int = None):
+        '''
+        Check if the columns have leading and/or trailing spaces.
+        The leading/trailing spaces should raise an error during the validation.
+        '''
+        leading_trailing_spaces = []
+        found_trailing_spaces = False
+        for idx, col in enumerate(cols):
+            if col.startswith(' ') or col.endswith(' '):
+                leading_trailing_spaces.append(self.header[idx]+' => |'+str(col)+'|')
+        if len(leading_trailing_spaces):
+            if line_number:
+                line_name = f'line {line_number} has'
+            else:
+                line_name = 'following headers have'
+            self.logger.error("The "+line_name+" leading and/or trailing spaces: "+' ; '.join(leading_trailing_spaces))
+            found_trailing_spaces = True
+        return found_trailing_spaces
+
+
+    def check_ext(self, ext: str) -> bool:
+        if self.file.endswith(ext):
+            return True
+        return False
+
+
+    def check_build_is_legit(self, build: str) -> bool:
+        if build in BUILD_LIST:
+            return True
+        return False
+
+
+    def get_comments_info(self, type: str) -> str:
+        ''' Retrieve information from the comment lines '''
+        with gzip.open(self.file, 'rb') as f_in:
+            for f_line in f_in:
+                line = f_line.decode()
+                # Return the value of the first comment line matching the given prefix
+                if line.startswith(type):
+                    info = (line.split('='))[1]
+                    return info.strip()
+
+    def run_generic_validator(self,check_filename):
+        self.logger.propagate = False
+
+        # Check files exist
+        if not self.file or not self.logfile:
+            self.logger.info("Missing file and/or logfile")
+            self.set_file_is_invalid()
+        elif self.file and not os.path.exists(self.file):
+            self.logger.info("Error: the file '"+self.file+"' can't be found")
+            self.set_file_is_invalid()
+
+        # Validate file extension
+        self.validate_file_extension()
+
+        # Validate file name nomenclature
+        if self.is_file_valid() and check_filename:
+            self.validate_filename()
+
+        # Only for harmonized files
+        if self.is_file_valid() and type(self).__name__ != 'ValidatorFormatted':
+            self.compare_with_filename()
+
+        # Validate column headers
+        if self.is_file_valid():
+            self.validate_headers()
+
+        # Validate data content
+        if self.is_file_valid():
+            self.validate_data()
+
+        # Close log handler
+        self.logger.removeHandler(self.handler)
+        self.handler.close()
+
+    def run_validator(self):
+        self.run_generic_validator(True)
+
+    def run_validator_skip_check_filename(self):
+        self.run_generic_validator(False)
+
+
+    def validate_filename(self):
+        ''' Validate the file name structure. '''
+        print("To be implemented in inherited classes")
+        pass
+
+
+    def validate_headers(self):
+        ''' Validate the list of column names. '''
+        print("To be implemented in inherited classes")
+        pass
+
+
+    def validate_line_content(self, cols_content, var_line_number: int):
+        ''' Validate each data row. '''
+        print("To be implemented in inherited classes")
+        pass
+
+
+    def extract_specific_metadata(self, line: str):
+        ''' Extra method to extract and validate specific data. '''
+        print("To be implemented in inherited classes")
+        pass
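For orientation, the four stub methods plus the class-level schema attributes are the whole extension surface of `ValidatorBase`. A skeletal subclass sketch follows; the schema names come from `schemas.py`, the filename rule is a made-up placeholder (the real one lives in `formatted/validator.py`), and note that `run_generic_validator()` skips `compare_with_filename()` based on the class being named exactly `ValidatorFormatted`:

```python
# Skeletal subclass sketch of ValidatorBase (details elided, not the shipped validator).
import os
import re

from pgscatalog.validate.lib.validator_base import ValidatorBase
from pgscatalog.validate.lib.schemas import (FORMATTED_VALIDATORS,
                                             VALID_COLS_FORMATTED,
                                             VALID_TYPE_FORMATTED,
                                             FORMATTED_META_GENERIC)


class ValidatorFormatted(ValidatorBase):
    # Class-level attributes read by ValidatorBase.
    schema_validators = FORMATTED_VALIDATORS
    valid_cols = VALID_COLS_FORMATTED
    valid_type = VALID_TYPE_FORMATTED
    meta_format = FORMATTED_META_GENERIC  # consumed by validate_content()

    def validate_filename(self):
        # Placeholder naming rule, assumed for this sketch only.
        if not re.match(r'^PGS\d{6}\.txt(\.gz)?$', os.path.basename(self.file)):
            self.logger.error("Filename does not match the expected pattern")
            self.set_file_is_invalid()

    def validate_headers(self):
        self.setup_field_validation()
        if not self.cols_to_validate:
            self.logger.error("No known column found in the file header")
            self.set_file_is_invalid()

    def validate_line_content(self, cols_content, var_line_number):
        pass  # per-row checks go here

    def extract_specific_metadata(self, line):
        pass  # e.g. capture #variants_number for a later comparison


validator = ValidatorFormatted(file='PGS000001.txt.gz')  # hypothetical input
validator.run_validator()
print(validator.is_file_valid())
```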