From c511efb52daca87bbc641e58a85a17d259c63f46 Mon Sep 17 00:00:00 2001 From: Florent Yvon Date: Tue, 20 Aug 2024 19:53:40 +0100 Subject: [PATCH] Added legacy pgscatalog_utils scoring file validator to pygscatalog.validate --- pgscatalog.validate/LICENSE | 201 ++++++++ pgscatalog.validate/README.md | 10 + pgscatalog.validate/poetry.lock | 214 +++++++++ pgscatalog.validate/poetry.toml | 3 + pgscatalog.validate/pyproject.toml | 29 ++ .../src/pgscatalog/validate/__init__.py | 0 .../src/pgscatalog/validate/cli/__init__.py | 0 .../validate/cli/validate_scorefile.py | 171 +++++++ .../src/pgscatalog/validate/lib/__init__.py | 0 .../validate/lib/common_constants.py | 44 ++ .../validate/lib/formatted/__init__.py | 0 .../validate/lib/formatted/validator.py | 197 ++++++++ .../lib/harmonized_position/__init__.py | 0 .../lib/harmonized_position/validator.py | 98 ++++ .../src/pgscatalog/validate/lib/helpers.py | 29 ++ .../src/pgscatalog/validate/lib/schemas.py | 157 +++++++ .../validate/lib/validate_scorefile.py | 171 +++++++ .../pgscatalog/validate/lib/validator_base.py | 429 ++++++++++++++++++ 18 files changed, 1753 insertions(+) create mode 100644 pgscatalog.validate/LICENSE create mode 100644 pgscatalog.validate/README.md create mode 100644 pgscatalog.validate/poetry.lock create mode 100644 pgscatalog.validate/poetry.toml create mode 100644 pgscatalog.validate/pyproject.toml create mode 100644 pgscatalog.validate/src/pgscatalog/validate/__init__.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/cli/__init__.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/cli/validate_scorefile.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/lib/__init__.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/lib/common_constants.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/lib/formatted/__init__.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/lib/formatted/validator.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/lib/harmonized_position/__init__.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/lib/harmonized_position/validator.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/lib/helpers.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/lib/schemas.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/lib/validate_scorefile.py create mode 100644 pgscatalog.validate/src/pgscatalog/validate/lib/validator_base.py diff --git a/pgscatalog.validate/LICENSE b/pgscatalog.validate/LICENSE new file mode 100644 index 0000000..261eeb9 --- /dev/null +++ b/pgscatalog.validate/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/pgscatalog.validate/README.md b/pgscatalog.validate/README.md new file mode 100644 index 0000000..32f20b9 --- /dev/null +++ b/pgscatalog.validate/README.md @@ -0,0 +1,10 @@ +# `pgscatalog.validate` + +This Python package contains: + +* CLI applications to check/validate that the scoring files and harmonized scoring files match the PGS Catalog scoring file formats +* library classes and functions for working with scoring file validation + +| Application | Description | Link | +|-----------------------|------------------------|-----------------------| +| `pgscatalog-validate` | Validate scoring files | [README](missing_url) | diff --git a/pgscatalog.validate/poetry.lock b/pgscatalog.validate/poetry.lock new file mode 100644 index 0000000..5db99d6 --- /dev/null +++ b/pgscatalog.validate/poetry.lock @@ -0,0 +1,214 @@ +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. + +[[package]] +name = "numpy" +version = "2.1.0" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = ">=3.10" +files = [ + {file = "numpy-2.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:6326ab99b52fafdcdeccf602d6286191a79fe2fda0ae90573c5814cd2b0bc1b8"}, + {file = "numpy-2.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0937e54c09f7a9a68da6889362ddd2ff584c02d015ec92672c099b61555f8911"}, + {file = "numpy-2.1.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:30014b234f07b5fec20f4146f69e13cfb1e33ee9a18a1879a0142fbb00d47673"}, + {file = "numpy-2.1.0-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:899da829b362ade41e1e7eccad2cf274035e1cb36ba73034946fccd4afd8606b"}, + {file = "numpy-2.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08801848a40aea24ce16c2ecde3b756f9ad756586fb2d13210939eb69b023f5b"}, + {file = "numpy-2.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:398049e237d1aae53d82a416dade04defed1a47f87d18d5bd615b6e7d7e41d1f"}, + {file = "numpy-2.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0abb3916a35d9090088a748636b2c06dc9a6542f99cd476979fb156a18192b84"}, + {file = "numpy-2.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:10e2350aea18d04832319aac0f887d5fcec1b36abd485d14f173e3e900b83e33"}, + {file = "numpy-2.1.0-cp310-cp310-win32.whl", hash = "sha256:f6b26e6c3b98adb648243670fddc8cab6ae17473f9dc58c51574af3e64d61211"}, + {file = "numpy-2.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:f505264735ee074250a9c78247ee8618292091d9d1fcc023290e9ac67e8f1afa"}, + {file = "numpy-2.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:76368c788ccb4f4782cf9c842b316140142b4cbf22ff8db82724e82fe1205dce"}, + {file = "numpy-2.1.0-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:f8e93a01a35be08d31ae33021e5268f157a2d60ebd643cfc15de6ab8e4722eb1"}, + {file = "numpy-2.1.0-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:9523f8b46485db6939bd069b28b642fec86c30909cea90ef550373787f79530e"}, + {file = "numpy-2.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54139e0eb219f52f60656d163cbe67c31ede51d13236c950145473504fa208cb"}, + {file = "numpy-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5ebbf9fbdabed208d4ecd2e1dfd2c0741af2f876e7ae522c2537d404ca895c3"}, + {file = "numpy-2.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:378cb4f24c7d93066ee4103204f73ed046eb88f9ad5bb2275bb9fa0f6a02bd36"}, + {file = "numpy-2.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = 
"sha256:d8f699a709120b220dfe173f79c73cb2a2cab2c0b88dd59d7b49407d032b8ebd"}, + {file = "numpy-2.1.0-cp311-cp311-win32.whl", hash = "sha256:ffbd6faeb190aaf2b5e9024bac9622d2ee549b7ec89ef3a9373fa35313d44e0e"}, + {file = "numpy-2.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:0af3a5987f59d9c529c022c8c2a64805b339b7ef506509fba7d0556649b9714b"}, + {file = "numpy-2.1.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:fe76d75b345dc045acdbc006adcb197cc680754afd6c259de60d358d60c93736"}, + {file = "numpy-2.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f358ea9e47eb3c2d6eba121ab512dfff38a88db719c38d1e67349af210bc7529"}, + {file = "numpy-2.1.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:dd94ce596bda40a9618324547cfaaf6650b1a24f5390350142499aa4e34e53d1"}, + {file = "numpy-2.1.0-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:b47c551c6724960479cefd7353656498b86e7232429e3a41ab83be4da1b109e8"}, + {file = "numpy-2.1.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0756a179afa766ad7cb6f036de622e8a8f16ffdd55aa31f296c870b5679d745"}, + {file = "numpy-2.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:24003ba8ff22ea29a8c306e61d316ac74111cebf942afbf692df65509a05f111"}, + {file = "numpy-2.1.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:b34fa5e3b5d6dc7e0a4243fa0f81367027cb6f4a7215a17852979634b5544ee0"}, + {file = "numpy-2.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c4f982715e65036c34897eb598d64aef15150c447be2cfc6643ec7a11af06574"}, + {file = "numpy-2.1.0-cp312-cp312-win32.whl", hash = "sha256:c4cd94dfefbefec3f8b544f61286584292d740e6e9d4677769bc76b8f41deb02"}, + {file = "numpy-2.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:a0cdef204199278f5c461a0bed6ed2e052998276e6d8ab2963d5b5c39a0500bc"}, + {file = "numpy-2.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8ab81ccd753859ab89e67199b9da62c543850f819993761c1e94a75a814ed667"}, + {file = "numpy-2.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:442596f01913656d579309edcd179a2a2f9977d9a14ff41d042475280fc7f34e"}, + {file = "numpy-2.1.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:848c6b5cad9898e4b9ef251b6f934fa34630371f2e916261070a4eb9092ffd33"}, + {file = "numpy-2.1.0-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:54c6a63e9d81efe64bfb7bcb0ec64332a87d0b87575f6009c8ba67ea6374770b"}, + {file = "numpy-2.1.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:652e92fc409e278abdd61e9505649e3938f6d04ce7ef1953f2ec598a50e7c195"}, + {file = "numpy-2.1.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ab32eb9170bf8ffcbb14f11613f4a0b108d3ffee0832457c5d4808233ba8977"}, + {file = "numpy-2.1.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:8fb49a0ba4d8f41198ae2d52118b050fd34dace4b8f3fb0ee34e23eb4ae775b1"}, + {file = "numpy-2.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:44e44973262dc3ae79e9063a1284a73e09d01b894b534a769732ccd46c28cc62"}, + {file = "numpy-2.1.0-cp313-cp313-win32.whl", hash = "sha256:ab83adc099ec62e044b1fbb3a05499fa1e99f6d53a1dde102b2d85eff66ed324"}, + {file = "numpy-2.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:de844aaa4815b78f6023832590d77da0e3b6805c644c33ce94a1e449f16d6ab5"}, + {file = "numpy-2.1.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:343e3e152bf5a087511cd325e3b7ecfd5b92d369e80e74c12cd87826e263ec06"}, + {file = "numpy-2.1.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:f07fa2f15dabe91259828ce7d71b5ca9e2eb7c8c26baa822c825ce43552f4883"}, + 
{file = "numpy-2.1.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:5474dad8c86ee9ba9bb776f4b99ef2d41b3b8f4e0d199d4f7304728ed34d0300"}, + {file = "numpy-2.1.0-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:1f817c71683fd1bb5cff1529a1d085a57f02ccd2ebc5cd2c566f9a01118e3b7d"}, + {file = "numpy-2.1.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a3336fbfa0d38d3deacd3fe7f3d07e13597f29c13abf4d15c3b6dc2291cbbdd"}, + {file = "numpy-2.1.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a894c51fd8c4e834f00ac742abad73fc485df1062f1b875661a3c1e1fb1c2f6"}, + {file = "numpy-2.1.0-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:9156ca1f79fc4acc226696e95bfcc2b486f165a6a59ebe22b2c1f82ab190384a"}, + {file = "numpy-2.1.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:624884b572dff8ca8f60fab591413f077471de64e376b17d291b19f56504b2bb"}, + {file = "numpy-2.1.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:15ef8b2177eeb7e37dd5ef4016f30b7659c57c2c0b57a779f1d537ff33a72c7b"}, + {file = "numpy-2.1.0-pp310-pypy310_pp73-macosx_14_0_x86_64.whl", hash = "sha256:e5f0642cdf4636198a4990de7a71b693d824c56a757862230454629cf62e323d"}, + {file = "numpy-2.1.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f15976718c004466406342789f31b6673776360f3b1e3c575f25302d7e789575"}, + {file = "numpy-2.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:6c1de77ded79fef664d5098a66810d4d27ca0224e9051906e634b3f7ead134c2"}, + {file = "numpy-2.1.0.tar.gz", hash = "sha256:7dc90da0081f7e1da49ec4e398ede6a8e9cc4f5ebe5f9e06b443ed889ee9aaa2"}, +] + +[[package]] +name = "packaging" +version = "24.1" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, + {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, +] + +[[package]] +name = "pandas" +version = "2.2.2" +description = "Powerful data structures for data analysis, time series, and statistics" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pandas-2.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce"}, + {file = "pandas-2.2.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7adfc142dac335d8c1e0dcbd37eb8617eac386596eb9e1a1b77791cf2498238"}, + {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4abfe0be0d7221be4f12552995e58723c7422c80a659da13ca382697de830c08"}, + {file = "pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8635c16bf3d99040fdf3ca3db669a7250ddf49c55dc4aa8fe0ae0fa8d6dcc1f0"}, + {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:40ae1dffb3967a52203105a077415a86044a2bea011b5f321c6aa64b379a3f51"}, + {file = "pandas-2.2.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8e5a0b00e1e56a842f922e7fae8ae4077aee4af0acb5ae3622bd4b4c30aedf99"}, + {file = "pandas-2.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:ddf818e4e6c7c6f4f7c8a12709696d193976b591cc7dc50588d3d1a6b5dc8772"}, + {file = "pandas-2.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:696039430f7a562b74fa45f540aca068ea85fa34c244d0deee539cb6d70aa288"}, + {file = "pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:8e90497254aacacbc4ea6ae5e7a8cd75629d6ad2b30025a4a8b09aa4faf55151"}, + {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58b84b91b0b9f4bafac2a0ac55002280c094dfc6402402332c0913a59654ab2b"}, + {file = "pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d2123dc9ad6a814bcdea0f099885276b31b24f7edf40f6cdbc0912672e22eee"}, + {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:2925720037f06e89af896c70bca73459d7e6a4be96f9de79e2d440bd499fe0db"}, + {file = "pandas-2.2.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0cace394b6ea70c01ca1595f839cf193df35d1575986e484ad35c4aeae7266c1"}, + {file = "pandas-2.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:873d13d177501a28b2756375d59816c365e42ed8417b41665f346289adc68d24"}, + {file = "pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:9dfde2a0ddef507a631dc9dc4af6a9489d5e2e740e226ad426a05cabfbd7c8ef"}, + {file = "pandas-2.2.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e9b79011ff7a0f4b1d6da6a61aa1aa604fb312d6647de5bad20013682d1429ce"}, + {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cb51fe389360f3b5a4d57dbd2848a5f033350336ca3b340d1c53a1fad33bcad"}, + {file = "pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eee3a87076c0756de40b05c5e9a6069c035ba43e8dd71c379e68cab2c20f16ad"}, + {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:3e374f59e440d4ab45ca2fffde54b81ac3834cf5ae2cdfa69c90bc03bde04d76"}, + {file = "pandas-2.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:43498c0bdb43d55cb162cdc8c06fac328ccb5d2eabe3cadeb3529ae6f0517c32"}, + {file = "pandas-2.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:d187d355ecec3629624fccb01d104da7d7f391db0311145817525281e2804d23"}, + {file = "pandas-2.2.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0ca6377b8fca51815f382bd0b697a0814c8bda55115678cbc94c30aacbb6eff2"}, + {file = "pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9057e6aa78a584bc93a13f0a9bf7e753a5e9770a30b4d758b8d5f2a62a9433cd"}, + {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:001910ad31abc7bf06f49dcc903755d2f7f3a9186c0c040b827e522e9cef0863"}, + {file = "pandas-2.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66b479b0bd07204e37583c191535505410daa8df638fd8e75ae1b383851fe921"}, + {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a77e9d1c386196879aa5eb712e77461aaee433e54c68cf253053a73b7e49c33a"}, + {file = "pandas-2.2.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:92fd6b027924a7e178ac202cfbe25e53368db90d56872d20ffae94b96c7acc57"}, + {file = "pandas-2.2.2-cp39-cp39-win_amd64.whl", hash = "sha256:640cef9aa381b60e296db324337a554aeeb883ead99dc8f6c18e81a93942f5f4"}, + {file = "pandas-2.2.2.tar.gz", hash = "sha256:9e79019aba43cb4fda9e4d983f8e88ca0373adbb697ae9c6c43093218de28b54"}, +] + +[package.dependencies] +numpy = [ + {version = ">=1.22.4", markers = "python_version < \"3.11\""}, + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, +] +python-dateutil = ">=2.8.2" +pytz = ">=2020.1" +tzdata = ">=2022.7" + +[package.extras] +all = ["PyQt5 (>=5.15.9)", "SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)", "beautifulsoup4 (>=4.11.2)", "bottleneck (>=1.3.6)", 
"dataframe-api-compat (>=0.1.7)", "fastparquet (>=2022.12.0)", "fsspec (>=2022.11.0)", "gcsfs (>=2022.11.0)", "html5lib (>=1.1)", "hypothesis (>=6.46.1)", "jinja2 (>=3.1.2)", "lxml (>=4.9.2)", "matplotlib (>=3.6.3)", "numba (>=0.56.4)", "numexpr (>=2.8.4)", "odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "pandas-gbq (>=0.19.0)", "psycopg2 (>=2.9.6)", "pyarrow (>=10.0.1)", "pymysql (>=1.0.2)", "pyreadstat (>=1.2.0)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "qtpy (>=2.3.0)", "s3fs (>=2022.11.0)", "scipy (>=1.10.0)", "tables (>=3.8.0)", "tabulate (>=0.9.0)", "xarray (>=2022.12.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)", "zstandard (>=0.19.0)"] +aws = ["s3fs (>=2022.11.0)"] +clipboard = ["PyQt5 (>=5.15.9)", "qtpy (>=2.3.0)"] +compression = ["zstandard (>=0.19.0)"] +computation = ["scipy (>=1.10.0)", "xarray (>=2022.12.0)"] +consortium-standard = ["dataframe-api-compat (>=0.1.7)"] +excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.1.0)", "python-calamine (>=0.1.7)", "pyxlsb (>=1.0.10)", "xlrd (>=2.0.1)", "xlsxwriter (>=3.0.5)"] +feather = ["pyarrow (>=10.0.1)"] +fss = ["fsspec (>=2022.11.0)"] +gcp = ["gcsfs (>=2022.11.0)", "pandas-gbq (>=0.19.0)"] +hdf5 = ["tables (>=3.8.0)"] +html = ["beautifulsoup4 (>=4.11.2)", "html5lib (>=1.1)", "lxml (>=4.9.2)"] +mysql = ["SQLAlchemy (>=2.0.0)", "pymysql (>=1.0.2)"] +output-formatting = ["jinja2 (>=3.1.2)", "tabulate (>=0.9.0)"] +parquet = ["pyarrow (>=10.0.1)"] +performance = ["bottleneck (>=1.3.6)", "numba (>=0.56.4)", "numexpr (>=2.8.4)"] +plot = ["matplotlib (>=3.6.3)"] +postgresql = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "psycopg2 (>=2.9.6)"] +pyarrow = ["pyarrow (>=10.0.1)"] +spss = ["pyreadstat (>=1.2.0)"] +sql-other = ["SQLAlchemy (>=2.0.0)", "adbc-driver-postgresql (>=0.8.0)", "adbc-driver-sqlite (>=0.8.0)"] +test = ["hypothesis (>=6.46.1)", "pytest (>=7.3.2)", "pytest-xdist (>=2.2.0)"] +xml = ["lxml (>=4.9.2)"] + +[[package]] +name = "pandas-schema" +version = "0.3.6" +description = "A validation library for Pandas data frames using user-friendly schemas" +optional = false +python-versions = "*" +files = [ + {file = "pandas_schema-0.3.6-py3-none-any.whl", hash = "sha256:7497621cdf8c191fca1ef6ded9caa6f2153b220f120a2686d921f80c8031994d"}, + {file = "pandas_schema-0.3.6.tar.gz", hash = "sha256:c6bfc52c4bae9cdd7420fbe8c4b0622b769457827c3fc819928405638caf605f"}, +] + +[package.dependencies] +numpy = "*" +packaging = "*" +pandas = ">=0.19" + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, +] + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "pytz" +version = "2024.1" +description = "World timezone definitions, modern and historical" +optional = false +python-versions = "*" +files = [ + {file = "pytz-2024.1-py2.py3-none-any.whl", hash = "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319"}, + {file = "pytz-2024.1.tar.gz", hash = "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812"}, +] + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +optional = 
false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + +[[package]] +name = "tzdata" +version = "2024.1" +description = "Provider of IANA time zone data" +optional = false +python-versions = ">=2" +files = [ + {file = "tzdata-2024.1-py2.py3-none-any.whl", hash = "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252"}, + {file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"}, +] + +[metadata] +lock-version = "2.0" +python-versions = ">=3.10" +content-hash = "a3e59d2093cbf1625eff71e74d9ffc6bbc19ad74bebc5f4c79fda4ad44aa9de1" diff --git a/pgscatalog.validate/poetry.toml b/pgscatalog.validate/poetry.toml new file mode 100644 index 0000000..384db5f --- /dev/null +++ b/pgscatalog.validate/poetry.toml @@ -0,0 +1,3 @@ +[virtualenvs] +create = true +in-project = true \ No newline at end of file diff --git a/pgscatalog.validate/pyproject.toml b/pgscatalog.validate/pyproject.toml new file mode 100644 index 0000000..76165fe --- /dev/null +++ b/pgscatalog.validate/pyproject.toml @@ -0,0 +1,29 @@ +[tool.poetry] +name = "pgscatalog.validate" +version = "0.1" +description = "Lorem ipsum" +authors = [ + "Benjamin Wingfield ", + "Samuel Lambert ", + "Laurent Gil ", + "Florent Yvon " +] +readme = "README.md" +packages = [ + { include = "pgscatalog", from = "src" }, +] + +[tool.poetry.dependencies] +python = ">=3.10" +pandas = "^2.2.0" +pandas-schema = "^0.3.6" + +[tool.poetry.scripts] +pgscatalog-validate = 'pgscatalog.validate.cli.validate_scorefile:validate_scorefile' + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + +[tool.coverage.run] +source = ['src/pgscatalog/validate'] \ No newline at end of file diff --git a/pgscatalog.validate/src/pgscatalog/validate/__init__.py b/pgscatalog.validate/src/pgscatalog/validate/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pgscatalog.validate/src/pgscatalog/validate/cli/__init__.py b/pgscatalog.validate/src/pgscatalog/validate/cli/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pgscatalog.validate/src/pgscatalog/validate/cli/validate_scorefile.py b/pgscatalog.validate/src/pgscatalog/validate/cli/validate_scorefile.py new file mode 100644 index 0000000..d1153e9 --- /dev/null +++ b/pgscatalog.validate/src/pgscatalog/validate/cli/validate_scorefile.py @@ -0,0 +1,171 @@ +import os, glob, re +import argparse +import logging +import textwrap + +data_sum = {'valid': [], 'invalid': [], 'other': []} + +val_types = ('formatted', 'hm_pos') + +logging.basicConfig(level=logging.INFO, format='(%(levelname)s): %(message)s') + + +def validate_scorefile() -> None: + global data_sum, score_dir + args = _parse_args() + _check_args(args) + + # Check PGS Catalog file name nomenclature + check_filename = False + if args.check_filename: + check_filename = True + else: + print("WARNING: the parameter '--check_filename' is not present in the submitted command line, therefore the validation of the scoring file name(s) won't be performed.") + + validator_type = args.t + files_dir = args.dir + log_dir = args.log_dir + + ## Select validator class ## + if validator_type == 'formatted': + import pgscatalog.validate.lib.formatted.validator as validator_package + elif 
validator_type == 'hm_pos':
+        import pgscatalog.validate.lib.harmonized_position.validator as validator_package
+
+    ## Run validator ##
+    # One file
+    if args.f:
+        _run_validator(args.f,log_dir,score_dir,validator_package,check_filename,validator_type)
+    # Content of the directory
+    elif files_dir:
+        count_files = 0
+        # Browse directory: for each file run validator
+        for filepath in sorted(glob.glob(files_dir+"/*.*")):
+            _run_validator(filepath,log_dir,score_dir,validator_package,check_filename,validator_type)
+            count_files += 1
+
+        # Print summary + results
+        print("\nSummary:")
+        if data_sum['valid']:
+            print(f"- Valid: {len(data_sum['valid'])}/{count_files}")
+        if data_sum['invalid']:
+            print(f"- Invalid: {len(data_sum['invalid'])}/{count_files}")
+        if data_sum['other']:
+            print(f"- Other issues: {len(data_sum['other'])}/{count_files}")
+
+        if data_sum['invalid']:
+            print("Invalid files:")
+            print("\n".join(data_sum['invalid']))
+
+
+def _read_last_line(file: str) -> str:
+    '''
+    Return the last line of the file
+    '''
+    with open(file, "r") as file_handle:
+        return file_handle.readlines()[-1]
+
+
+def _file_validation_state(filename: str, log_file: str) -> None:
+    global data_sum
+    if os.path.exists(log_file):
+        log_result = _read_last_line(log_file)
+        if re.search("File is valid", log_result):
+            print("> valid\n")
+            data_sum['valid'].append(filename)
+        elif re.search("File is invalid", log_result):
+            print("#### invalid! ####\n")
+            data_sum['invalid'].append(filename)
+        else:
+            print("!! validation process had an issue. Please look at the logs.\n")
+            data_sum['other'].append(filename)
+    else:
+        print("!! validation process had an issue: the log file can't be found")
+        data_sum['other'].append(filename)
+
+
+def _check_args(args: argparse.Namespace) -> None:
+    global score_dir
+
+    ## Check parameters ##
+    # Type of validator
+    if args.t not in val_types:
+        print(f"Error: Validator type (option -t) '{args.t}' is not in the list of recognized types: {val_types}.")
+        exit(1)
+    # Logs dir
+    if not os.path.isdir(args.log_dir):
+        print(f"Error: Log dir '{args.log_dir}' can't be found!")
+        exit(1)
+    # File and directory parameters (only one of the '-f' and '--dir' can be used)
+    if args.f and args.dir:
+        print("Error: you can't use both options [-f] - single scoring file and [--dir] - directory of scoring files.
Please use only one of these two options!")
+        exit(1)
+    elif not args.f and not args.dir:
+        print("Error: you need to provide a scoring file [-f] or a directory of scoring files [--dir]!")
+        exit(1)
+    elif args.f and not os.path.isfile(args.f):
+        print(f"Error: Scoring file '{args.f}' can't be found!")
+        exit(1)
+    elif args.dir and not os.path.isdir(args.dir):
+        print(f"Error: the scoring file directory '{args.dir}' can't be found!")
+        exit(1)
+    # Scoring files directory (only to compare with the harmonized files)
+    score_dir = None
+    if args.score_dir:
+        score_dir = args.score_dir
+        if not os.path.isdir(score_dir):
+            print(f"Error: Scoring file directory '{score_dir}' can't be found!")
+            exit(1)
+    elif args.t != 'formatted':
+        print("WARNING: the parameter '--score_dir' is not present in the submitted command line, therefore the comparison of the number of data rows between the formatted scoring file(s) and the harmonized scoring file(s) won't be performed.")
+
+
+def _run_validator(filepath: str, log_dir: str, score_dir: str, validator_package: object, check_filename: bool, validator_type: str) -> None:
+    ''' Run the file validator '''
+    file = os.path.basename(filepath)
+    filename = file.split('.')[0]
+    print(f"# Filename: {file}")
+    log_file = f'{log_dir}/{filename}_log.txt'
+
+    # Run validator
+    validator = validator_package.init_validator(filepath,log_file,score_dir)
+    if check_filename:
+        validator.run_validator()
+    else:
+        validator.run_validator_skip_check_filename()
+
+    # Check log
+    _file_validation_state(file,log_file)
+
+
+def _description_text() -> str:
+    return textwrap.dedent('''\
+    Validate a set of scoring files to match the PGS Catalog scoring file formats.
+    It can validate:
+    - The formatted scoring file format (https://www.pgscatalog.org/downloads/#dl_ftp_scoring)
+    - The harmonized (Position) scoring file format (https://www.pgscatalog.org/downloads/#dl_ftp_scoring_hm_pos)
+    ''')
+
+
+def _epilog_text() -> str:
+    return textwrap.dedent(f'''\
+    You need to specify the type of file format to validate, using the parameter '-t' ({' or '.join(val_types)}).
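+    Example (assuming a formatted scoring file and an existing 'logs' directory):
+      pgscatalog-validate -t formatted -f PGS000001.txt.gz --log_dir logs --check_filename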
+    ''')
+
+
+def _parse_args(args=None) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=_description_text(), epilog=_epilog_text(),
+                                     formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("-t", help=f"Type of validator: {' or '.join(val_types)}", metavar='VALIDATOR_TYPE')
+    parser.add_argument("-f", help='The path to the polygenic scoring file to be validated (no need to use the [--dir] option)', metavar='SCORING_FILE_NAME')
+    parser.add_argument('--dir', help='The name of the directory containing the files that need to be processed (no need to use the [-f] option)')
+    parser.add_argument('--score_dir', help='The name of the directory containing the formatted scoring files to compare with the harmonized scoring files')
+    parser.add_argument('--log_dir', help='The name of the log directory where the log file(s) will be stored', required=True)
+    parser.add_argument('--check_filename', help='Check that the file name matches the PGS Catalog nomenclature', required=False, action='store_true')
+    return parser.parse_args(args)
+
+
+if __name__ == '__main__':
+    validate_scorefile()
diff --git a/pgscatalog.validate/src/pgscatalog/validate/lib/__init__.py b/pgscatalog.validate/src/pgscatalog/validate/lib/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pgscatalog.validate/src/pgscatalog/validate/lib/common_constants.py b/pgscatalog.validate/src/pgscatalog/validate/lib/common_constants.py
new file mode 100644
index 0000000..768752a
--- /dev/null
+++ b/pgscatalog.validate/src/pgscatalog/validate/lib/common_constants.py
@@ -0,0 +1,44 @@
+SNP_DSET = 'rsID'
+CHR_DSET = 'chr_name'
+BP_DSET = 'chr_position'
+EFFECT_DSET = 'effect_allele'
+OTH_DSET = 'other_allele'
+EFFECT_WEIGHT_DSET = 'effect_weight'
+
+# Other columns
+LOCUS_DSET = 'locus_name'
+OR_DSET = 'OR'
+HR_DSET = 'HR'
+BETA_DSET = 'beta'
+FREQ_DSET = 'allelefrequency_effect'
+FLAG_INTERACTION_DSET = 'is_interaction'
+FLAG_RECESSIVE_DSET = 'is_recessive'
+FLAG_HAPLOTYPE_DSET = 'is_haplotype'
+FLAG_DIPLOTYPE_DSET = 'is_diplotype'
+METHOD_DSET = 'imputation_method'
+SNP_DESC_DSET = 'variant_description'
+INCLUSION_DSET = 'inclusion_criteria'
+DOSAGE_0_WEIGHT = 'dosage_0_weight'
+DOSAGE_1_WEIGHT = 'dosage_1_weight'
+DOSAGE_2_WEIGHT = 'dosage_2_weight'
+# hmPOS
+HM_SOURCE_DSET = 'hm_source'
+HM_SNP_DSET = 'hm_rsID'
+HM_CHR_DSET = 'hm_chr'
+HM_BP_DSET = 'hm_pos'
+HM_OTH_DSET = 'hm_inferOtherAllele'
+HM_MATCH_CHR_DSET = 'hm_match_chr'
+HM_MATCH_BP_DSET = 'hm_match_pos'
+# hmFinal
+VARIANT_DSET = 'variant_id'
+HM_CODE_DSET = 'hm_code'
+HM_INFO_DSET = 'hm_info'
+
+
+DSET_TYPES = {SNP_DSET: str, CHR_DSET: str, BP_DSET: int, EFFECT_DSET: str, OTH_DSET: str,
+              EFFECT_WEIGHT_DSET: float, VARIANT_DSET: str, HM_CODE_DSET: int, HM_INFO_DSET: str, LOCUS_DSET: str, OR_DSET: float, HR_DSET: float, BETA_DSET: float, FREQ_DSET: float,
+              FLAG_INTERACTION_DSET: str, FLAG_RECESSIVE_DSET: str, FLAG_HAPLOTYPE_DSET: str, FLAG_DIPLOTYPE_DSET: str,
+              METHOD_DSET: str, SNP_DESC_DSET: str, INCLUSION_DSET: str, DOSAGE_0_WEIGHT: float, DOSAGE_1_WEIGHT: float, DOSAGE_2_WEIGHT: float,
+              HM_SOURCE_DSET:str, HM_SNP_DSET: str, HM_CHR_DSET: str, HM_BP_DSET: int, HM_OTH_DSET: str, HM_MATCH_CHR_DSET: str, HM_MATCH_BP_DSET: int}
+
+TO_DISPLAY_ORDER = [ SNP_DSET, CHR_DSET, BP_DSET, EFFECT_DSET, OTH_DSET, EFFECT_WEIGHT_DSET, LOCUS_DSET, OR_DSET, HR_DSET, HM_CODE_DSET, HM_INFO_DSET, HM_SOURCE_DSET, HM_SNP_DSET, HM_BP_DSET, HM_OTH_DSET, HM_MATCH_CHR_DSET, HM_MATCH_BP_DSET]
\ No newline at end of file
diff --git
a/pgscatalog.validate/src/pgscatalog/validate/lib/formatted/__init__.py b/pgscatalog.validate/src/pgscatalog/validate/lib/formatted/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pgscatalog.validate/src/pgscatalog/validate/lib/formatted/validator.py b/pgscatalog.validate/src/pgscatalog/validate/lib/formatted/validator.py
new file mode 100644
index 0000000..3a3bfc0
--- /dev/null
+++ b/pgscatalog.validate/src/pgscatalog/validate/lib/formatted/validator.py
@@ -0,0 +1,197 @@
+import gzip
+import re
+from pandas_schema import Schema
+from ..schemas import *
+from ..validator_base import *
+
+'''
+PGS Catalog formatted scoring file validator
+- using pandas_schema https://github.com/TMiguelT/PandasSchema
+'''
+
+class ValidatorFormatted(ValidatorBase):
+
+    def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0):
+        super().__init__(file, score_dir, logfile, error_limit)
+        self.score_dir = None  # the formatted validator does not compare against a scoring file directory
+        self.meta_format = FORMATTED_META_GENERIC
+        self.schema_validators = FORMATTED_VALIDATORS
+        self.valid_cols = VALID_COLS_FORMATTED
+        self.valid_type = VALID_TYPE_FORMATTED
+        self.setup_field_validation()
+
+
+    def extract_specific_metadata(self,line):
+        ''' Extract some of the metadata. '''
+        match_variants_number = re.search(r'#variants_number=(\d+)', line)
+        if match_variants_number:
+            self.variants_number = int(match_variants_number.group(1))
+
+
+    def get_and_check_variants_number(self):
+        ''' Verify that the number of variant lines corresponds to the number of variants declared in the headers '''
+        variant_lines = 0
+
+        with gzip.open(self.file, 'rb') as f:
+            line_number = 0
+            for line in f:
+                line_number += 1
+                line = line.decode('utf-8').rstrip()
+                if line.startswith('#'):
+                    match_variants_number = re.search(r'#variants_number=(\d+)', line)
+                    if match_variants_number:
+                        self.variants_number = int(match_variants_number.group(1))
+                else:
+                    variant_lines += 1
+                    if re.search(r'\w+', line): # Line not empty
+                        cols = line.split(self.sep)
+                        has_trailing_spaces = self.check_leading_trailing_spaces(cols,line_number)
+                        if has_trailing_spaces:
+                            self.global_errors += 1
+                    else:
+                        self.logger.error(f'- Line {line_number} is empty')
+                        self.global_errors += 1
+
+        if self.variants_number:
+            variant_lines -= 1 # Remove the header line from the count
+            if self.variants_number != variant_lines:
+                self.logger.error(f'- The number of variant lines in the file ({variant_lines}) and the number of variants declared in the headers ({self.variants_number}) are different')
+                self.global_errors += 1
+        else:
+            self.logger.error("- Can't retrieve the number of variants from the headers")
+            self.global_errors += 1
+
+
+    def detect_duplicated_rows(self,dataframe_chunk):
+        ''' Detect duplicated rows in the scoring file.
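+        Rows are compared on the identifying columns present among rsID, chr_name, chr_position, effect_allele and other_allele; any duplicates are logged and their indexes added to bad_rows.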
'''
+        # Columns of interest to compare the different rows
+        cols_sel = []
+        for col in ['rsID','chr_name','chr_position','effect_allele','other_allele']:
+            if col in self.cols_to_validate:
+                cols_sel.append(col)
+
+        duplicate_status = dataframe_chunk.duplicated(cols_sel)
+        if any(duplicate_status):
+            duplicated_rows = dataframe_chunk[duplicate_status]
+            self.logger.error(f'Duplicated row(s) found: {len(duplicated_rows.index)}\n\t-> {duplicated_rows.to_string(header=False,index=False)}')
+            self.global_errors += 1
+            for index in duplicated_rows.index:
+                self.bad_rows.append(index)
+
+
+    def validate_data(self) -> bool:
+        ''' Validate the file: data format and data content '''
+        self.logger.info("Validating data...")
+        if not self.open_file_and_check_for_squareness():
+            self.logger.error("Please fix the table. Some rows have different numbers of columns to the header")
+            self.logger.info("Rows with different numbers of columns to the header are not validated")
+        # Check the consistency between the declared variants number and the actual number of variants in the file
+        self.get_and_check_variants_number()
+
+        for chunk in self.df_iterator(self.file):
+            dataframe_to_validate = chunk[self.cols_to_read]
+            dataframe_to_validate.columns = self.cols_to_validate # sets the headers to standard format if needed
+
+            # Detect duplicated rows
+            self.detect_duplicated_rows(dataframe_to_validate)
+
+            # validate the snp column if present
+            if SNP_DSET in self.header:
+                sub_schema = FORMATTED_VALIDATORS_SNP
+                if CHR_DSET in self.header and BP_DSET in self.header:
+                    sub_schema = FORMATTED_VALIDATORS_SNP_EMPTY
+                self.validate_schema(sub_schema,dataframe_to_validate)
+
+            if CHR_DSET in self.header and BP_DSET in self.header:
+                self.validate_schema(FORMATTED_VALIDATORS_POS, dataframe_to_validate)
+
+            if OR_DSET in self.header:
+                self.validate_schema(FORMATTED_VALIDATORS_OR,dataframe_to_validate)
+
+            if HR_DSET in self.header:
+                self.validate_schema(FORMATTED_VALIDATORS_HR,dataframe_to_validate)
+
+            self.process_errors()
+            if len(self.bad_rows) >= self.error_limit:
+                break
+        if not self.bad_rows and not self.global_errors:
+            if self.is_file_valid():
+                self.logger.info("File is valid")
+            else:
+                self.logger.info("File is invalid")
+        else:
+            self.logger.info("File is invalid - {} bad rows, limit set to {}".format(len(self.bad_rows), self.error_limit))
+            self.set_file_is_invalid()
+        return self.is_file_valid()
+
+
+    def validate_filename(self) -> bool:
+        ''' Validate the file name structure. '''
+        self.logger.info("Validating file name...")
+        filename = self.file.split('/')[-1].split('.')[0]
+        is_valid_filename = True
+        if not re.match(r'^PGS\d{6}$', filename):
+            self.logger.info("Invalid filename: {}".format(self.file))
+            self.logger.error("Filename: {} should follow the pattern 'PGSXXXXXX.txt.gz', where 'XXXXXX' are the 6 digits of the PGS identifier (e.g. PGS000001)".format(filename))
+            is_valid_filename = False
+            self.set_file_is_invalid()
+
+        return is_valid_filename
+
+
+    def validate_headers(self) -> bool:
+        ''' Validate the list of column names.
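+        The header must include effect_allele together with either rsID or chr_name/chr_position, plus at least one of the effect columns (effect_weight, OR or HR).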
'''
+        self.logger.info("Validating headers...")
+        self.detect_genomebuild_with_rsid()
+        required_is_subset = set(STD_COLS_VAR_FORMATTED).issubset(self.header)
+        if not required_is_subset:
+            # check if everything but snp:
+            required_is_subset = set(CHR_COLS_VAR_FORMATTED).issubset(self.header)
+            if not required_is_subset:
+                required_is_subset = set(SNP_COLS_VAR_FORMATTED).issubset(self.header)
+                if not required_is_subset:
+                    self.logger.error("Required headers: {} are not in the file header: {}".format(STD_COLS_VAR_FORMATTED, self.header))
+
+        # Check if at least one of the effect columns is there
+        has_effect_col = 0
+        for col in STD_COLS_EFFECT_FORMATTED:
+            if set([col]).issubset(self.header):
+                has_effect_col = 1
+                break
+        if not has_effect_col:
+            self.logger.error("Required headers: at least one of the columns '{}' must be in the file header: {}".format(STD_COLS_EFFECT_FORMATTED, self.header))
+            required_is_subset = None
+
+        if not required_is_subset:
+            self.logger.info("Invalid headers...exiting before any further checks")
+            self.set_file_is_invalid()
+
+        return required_is_subset
+
+
+    def detect_genomebuild_with_rsid(self):
+        ''' The column "rsID" should always be in the scoring file when the genome build is not reported (i.e. "NR") '''
+        self.get_genomebuild()
+        if self.genomebuild == 'NR':
+            if SNP_DSET not in self.header:
+                self.logger.error(f"- The combination: Genome Build = '{self.genomebuild}' & the missing column '{SNP_DSET}' in the header is not allowed as we have to manually guess the genome build.")
+                self.global_errors += 1
+
+
+    def get_genomebuild(self):
+        ''' Retrieve the Genome Build from the comments '''
+        with gzip.open(self.file, 'rb') as f_in:
+            for f_line in f_in:
+                line = f_line.decode()
+                # Parse the genome build from the header comments
+                if line.startswith('#genome_build'):
+                    gb = (line.split('='))[1]
+                    self.genomebuild = gb.strip()
+                    return
+
+
+##################################################################
+
+def init_validator(file, logfile, score_dir=None) -> ValidatorFormatted:
+    validator = ValidatorFormatted(file=file, score_dir=score_dir, logfile=logfile)
+    return validator
\ No newline at end of file
diff --git a/pgscatalog.validate/src/pgscatalog/validate/lib/harmonized_position/__init__.py b/pgscatalog.validate/src/pgscatalog/validate/lib/harmonized_position/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pgscatalog.validate/src/pgscatalog/validate/lib/harmonized_position/validator.py b/pgscatalog.validate/src/pgscatalog/validate/lib/harmonized_position/validator.py
new file mode 100644
index 0000000..62d3348
--- /dev/null
+++ b/pgscatalog.validate/src/pgscatalog/validate/lib/harmonized_position/validator.py
@@ -0,0 +1,98 @@
+import re
+from ..schemas import *
+from ..validator_base import *
+
+'''
+PGS Catalog Harmonized file validator
+- using pandas_schema https://github.com/TMiguelT/PandasSchema
+'''
+
+class ValidatorPos(ValidatorBase):
+    ''' Validator for the HmPOS Harmonized file format. '''
+
+    def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0):
+        super().__init__(file, score_dir, logfile, error_limit)
+        self.meta_format = HM_META_POS
+        self.schema_validators = POS_VALIDATORS
+        self.valid_cols = VALID_COLS_POS
+        self.valid_type = VALID_TYPE_POS
+        self.setup_field_validation()
+
+
+    def extract_specific_metadata(self,line):
+        ''' Extract some of the metadata.
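+        Currently only the declared variants_number is read from the header comments.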
'''
+        match_variants_number = re.search(r'#variants_number=(\d+)', line)
+        if match_variants_number:
+            self.variants_number = int(match_variants_number.group(1))
+
+
+    def validate_line_content(self,cols_content,var_line_number):
+        ''' Implement the abstract method from ValidatorBase to check some of the data in each row. '''
+        # Check lines
+        line_dict = dict(zip(self.header, cols_content))
+        line_cols = line_dict.keys()
+        # Check each chromosome data is consistent
+        chr_cols = ['chr_name', 'hm_chr', 'hm_match_chr']
+        if all(col_name in line_cols for col_name in chr_cols):
+            if line_dict['chr_name'] == line_dict['hm_chr'] and line_dict['hm_match_chr'] != 'True':
+                self.logger.error(f"- Variant line {var_line_number} | 'hm_match_chr' should be 'True': same chromosome ('chr_name={line_dict['chr_name']}' vs 'hm_chr={line_dict['hm_chr']}')")
+        # Check each position data is consistent
+        pos_cols = ['chr_position', 'hm_pos', 'hm_match_pos']
+        if all(col_name in line_cols for col_name in pos_cols):
+            if line_dict['chr_position'] == line_dict['hm_pos'] and line_dict['hm_match_pos'] != 'True':
+                self.logger.error(f"- Variant line {var_line_number} | 'hm_match_pos' should be 'True': same position ('chr_position={line_dict['chr_position']}' vs 'hm_pos={line_dict['hm_pos']}')")
+
+
+    def validate_filename(self) -> bool:
+        ''' Validate the file name structure. '''
+        self.logger.info("Validating file name...")
+        pgs_id, build = None, None
+        is_valid_filename = True
+        # hmPOS
+        filename = self.file.split('/')[-1].split('.')[0]
+        filename_parts = filename.split('_hmPOS_')
+        if len(filename_parts) != 2:
+            self.logger.error("Filename: {} should follow the pattern <pgs_id>_hmPOS_<build>.txt.gz [build=GRChXX]".format(filename))
+            self.set_file_is_invalid()
+            is_valid_filename = False
+        else:
+            pgs_id, build = filename_parts
+            self.file_pgs_id = pgs_id
+            self.file_genomebuild = build
+            if not self.check_build_is_legit(build):
+                self.logger.error("Build: {} is not an accepted build value".format(build))
+                self.set_file_is_invalid()
+                is_valid_filename = False
+
+        return is_valid_filename
+
+
+    def validate_headers(self) -> bool:
+        ''' Validate the list of column names.
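+        The harmonization columns (hm_source, hm_chr, hm_pos) are required, together with either the rsID/hm_rsID columns or chr_name.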
'''
+        self.logger.info("Validating headers...")
+        # Check that the required harmonization columns are present
+        required_is_subset = set(STD_COLS_VAR_POS).issubset(self.header)
+        if not required_is_subset:
+            self.logger.error("Required headers: {} are not in the file header: {}".format(STD_COLS_VAR_POS, self.header))
+
+        # Check if it has at least a "SNP" column or a "chromosome" column
+        required_pos = set(SNP_COLS_VAR_POS).issubset(self.header)
+        if not required_pos:
+            # check if everything but snp:
+            required_pos = set(CHR_COLS_VAR_POS).issubset(self.header)
+            if not required_pos:
+                self.logger.error("One of the following required headers is missing: '{}' and/or '{}' are not in the file header: {}".format(SNP_COLS_VAR_POS, CHR_COLS_VAR_POS, self.header))
+                required_is_subset = required_pos
+
+        if not required_is_subset:
+            self.logger.info("Invalid headers...exiting before any further checks")
+            self.set_file_is_invalid()
+
+        return required_is_subset
+
+
+##################################################################
+
+def init_validator(file, logfile, score_dir=None) -> ValidatorPos:
+    validator = ValidatorPos(file=file, score_dir=score_dir, logfile=logfile)
+    return validator
\ No newline at end of file
diff --git a/pgscatalog.validate/src/pgscatalog/validate/lib/helpers.py b/pgscatalog.validate/src/pgscatalog/validate/lib/helpers.py
new file mode 100644
index 0000000..7d786e5
--- /dev/null
+++ b/pgscatalog.validate/src/pgscatalog/validate/lib/helpers.py
@@ -0,0 +1,29 @@
+import math
+import pandas as pd
+from pandas_schema.validation import _SeriesValidation
+
+
+class InInclusiveRangeValidation(_SeriesValidation):
+    """
+    Checks that each element in the series is within a given inclusive numerical range.
+    Doesn't care if the values are not numeric - it will try anyway.
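+    Non-numeric values are coerced with pandas.to_numeric(errors='coerce') and therefore fail the range check.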
+    """
+    def __init__(self, min: float = -math.inf, max: float = math.inf, **kwargs):
+        """
+        :param min: The minimum (inclusive) value to accept
+        :param max: The maximum (inclusive) value to accept
+        """
+        self.min = min
+        self.max = max
+        super().__init__(**kwargs)
+
+    @property
+    def default_message(self):
+        return 'was not in the range [{}, {}]'.format(self.min, self.max)
+
+    def validate(self, series: pd.Series) -> pd.Series:
+        series = pd.to_numeric(series, errors='coerce')
+        return (series >= self.min) & (series <= self.max)
+
+
+
diff --git a/pgscatalog.validate/src/pgscatalog/validate/lib/schemas.py b/pgscatalog.validate/src/pgscatalog/validate/lib/schemas.py
new file mode 100644
index 0000000..29d7e67
--- /dev/null
+++ b/pgscatalog.validate/src/pgscatalog/validate/lib/schemas.py
@@ -0,0 +1,157 @@
+import numpy as np
+from pandas_schema import Column
+from pandas_schema.validation import MatchesPatternValidation, InListValidation, CanConvertValidation, LeadingWhitespaceValidation, TrailingWhitespaceValidation, CustomElementValidation
+from .helpers import InInclusiveRangeValidation
+from .common_constants import *
+
+
+#### Validation types ####
+
+VALID_TYPE_FORMATTED = 'formatted'
+VALID_TYPE_POS = 'hm_pos'
+
+
+#### Columns ####
+
+# Formatted scoring files
+STD_COLS_VAR_FORMATTED = (EFFECT_DSET, CHR_DSET, BP_DSET, SNP_DSET) #OR_DSET, RANGE_L_DSET, RANGE_U_DSET, BETA_DSET, SE_DSET, FREQ_DSET , EFFECT_DSET, OTH_DSET)
+
+SNP_COLS_VAR_FORMATTED = (EFFECT_DSET, CHR_DSET, BP_DSET)
+CHR_COLS_VAR_FORMATTED = (EFFECT_DSET, SNP_DSET)
+
+STD_COLS_EFFECT_FORMATTED = (EFFECT_WEIGHT_DSET,OR_DSET,HR_DSET)
+
+VALID_COLS_FORMATTED = (EFFECT_WEIGHT_DSET, OR_DSET, HR_DSET, BETA_DSET, FREQ_DSET, LOCUS_DSET, EFFECT_DSET, OTH_DSET, CHR_DSET, BP_DSET, SNP_DSET)
+
+# Harmonized scoring files - POS
+STD_COLS_VAR_POS = (HM_SOURCE_DSET, HM_CHR_DSET, HM_BP_DSET)
+
+SNP_COLS_VAR_POS = (SNP_DSET, HM_SNP_DSET)
+CHR_COLS_VAR_POS = (CHR_DSET,)
+
+VALID_COLS_POS = (HM_SOURCE_DSET, HM_SNP_DSET, HM_CHR_DSET, HM_BP_DSET, HM_OTH_DSET, HM_MATCH_CHR_DSET, HM_MATCH_BP_DSET)
+
+# Harmonized scoring files - Final
+STD_COLS_VAR_FINAL = (EFFECT_DSET, EFFECT_WEIGHT_DSET, HM_CODE_DSET, HM_INFO_DSET)
+
+SNP_COLS_VAR_FINAL = (VARIANT_DSET,)
+CHR_COLS_VAR_FINAL = (CHR_DSET, HM_CHR_DSET)
+
+VALID_COLS_FINAL = (SNP_DSET, CHR_DSET, BP_DSET, EFFECT_DSET, OTH_DSET, EFFECT_WEIGHT_DSET, LOCUS_DSET, HM_CODE_DSET, HM_SNP_DSET, HM_CHR_DSET, HM_BP_DSET, HM_OTH_DSET, HM_MATCH_CHR_DSET, HM_MATCH_BP_DSET)
+
+
+#### Global variables ####
+
+VALID_CHROMOSOMES = ['1', '2', '3', '4', '5', '6', '7', '8',
+                     '9', '10', '11', '12', '13', '14', '15', '16',
+                     '17', '18', '19', '20', '21', '22',
+                     'X', 'x', 'Y', 'y', 'XY', 'xy', 'MT', 'Mt', 'mt']
+
+VALID_FILE_EXTENSIONS = [".txt", ".txt.gz"]
+
+# For the harmonized files
+VALID_SOURCES = ['ENSEMBL','Author-reported']
+# VALID_CODES = ['5','4','3','1','0','-1','-4','-5']
+BUILD_LIST = ['GRCh37','GRCh38']
+
+
+error_msg = 'this column cannot be null/empty'
+null_validation = CustomElementValidation(lambda d: d is not np.nan and d != '', error_msg)
+
+
+#### Validators ####
+
+# Generic/shared validators
+GENERIC_VALIDATORS = {
+    CHR_DSET: Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True),
+    BP_DSET: Column(BP_DSET, [CanConvertValidation(DSET_TYPES[BP_DSET]), InInclusiveRangeValidation(1, 999999999)], allow_empty=True),
+    EFFECT_WEIGHT_DSET: Column(EFFECT_WEIGHT_DSET, [CanConvertValidation(DSET_TYPES[EFFECT_WEIGHT_DSET]), null_validation], allow_empty=False),
+    EFFECT_DSET:
Column(EFFECT_DSET, [MatchesPatternValidation(r'^[ACTGN\-]+$')], allow_empty=False), + OTH_DSET: Column(OTH_DSET, [MatchesPatternValidation(r'^[ACTGN\-]+$')], allow_empty=True), + LOCUS_DSET: Column(LOCUS_DSET, [CanConvertValidation(DSET_TYPES[LOCUS_DSET]), LeadingWhitespaceValidation(), TrailingWhitespaceValidation(), null_validation], allow_empty=True) +} + +# Formatted validators +FORMATTED_VALIDATORS = {k:v for k,v in GENERIC_VALIDATORS.items()} +FORMATTED_VALIDATORS[SNP_DSET] = Column(SNP_DSET, [CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^(\.|(rs|HLA\-\w+\*)[0-9]+)$')], allow_empty=True) +FORMATTED_VALIDATORS[OR_DSET] = Column(OR_DSET, [CanConvertValidation(DSET_TYPES[OR_DSET]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[HR_DSET] = Column(HR_DSET, [CanConvertValidation(DSET_TYPES[HR_DSET]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[BETA_DSET] = Column(BETA_DSET, [CanConvertValidation(DSET_TYPES[BETA_DSET]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[FREQ_DSET] = Column(FREQ_DSET, [CanConvertValidation(DSET_TYPES[FREQ_DSET]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[DOSAGE_0_WEIGHT] = Column(DOSAGE_0_WEIGHT, [CanConvertValidation(DSET_TYPES[DOSAGE_0_WEIGHT]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[DOSAGE_1_WEIGHT] = Column(DOSAGE_1_WEIGHT, [CanConvertValidation(DSET_TYPES[DOSAGE_1_WEIGHT]), null_validation], allow_empty=True) +FORMATTED_VALIDATORS[DOSAGE_2_WEIGHT] = Column(DOSAGE_2_WEIGHT, [CanConvertValidation(DSET_TYPES[DOSAGE_2_WEIGHT]), null_validation], allow_empty=True) + +FORMATTED_VALIDATORS_SNP = {k:v for k,v in FORMATTED_VALIDATORS.items()} +FORMATTED_VALIDATORS_SNP[SNP_DSET] = Column(SNP_DSET, [CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^(\.|(rs|HLA\-\w+\*)[0-9]+)$')], allow_empty=False) + +FORMATTED_VALIDATORS_SNP_EMPTY = {k:v for k,v in FORMATTED_VALIDATORS.items()} +FORMATTED_VALIDATORS_SNP_EMPTY[SNP_DSET] = Column(SNP_DSET, [CanConvertValidation(DSET_TYPES[SNP_DSET]), MatchesPatternValidation(r'^(rs[0-9]+|HLA\-\w+\*[0-9]+|nan|\.)$')], allow_empty=False) +FORMATTED_VALIDATORS_SNP_EMPTY[CHR_DSET] = Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=False) +FORMATTED_VALIDATORS_SNP_EMPTY[BP_DSET] = Column(BP_DSET, [CanConvertValidation(DSET_TYPES[BP_DSET]), InInclusiveRangeValidation(1, 999999999)], allow_empty=False) + +FORMATTED_VALIDATORS_POS = {k:v for k,v in FORMATTED_VALIDATORS.items()} +FORMATTED_VALIDATORS_POS[CHR_DSET] = Column(CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=False) +FORMATTED_VALIDATORS_POS[BP_DSET] = Column(BP_DSET, [CanConvertValidation(DSET_TYPES[BP_DSET]), InInclusiveRangeValidation(1, 999999999)], allow_empty=False) + +FORMATTED_VALIDATORS_OR = {k:v for k,v in FORMATTED_VALIDATORS.items()} +FORMATTED_VALIDATORS_OR[OR_DSET] = Column(OR_DSET, [CanConvertValidation(DSET_TYPES[OR_DSET])], allow_empty=False) + +FORMATTED_VALIDATORS_HR = {k:v for k,v in FORMATTED_VALIDATORS.items()} +FORMATTED_VALIDATORS_HR[HR_DSET] = Column(HR_DSET, [CanConvertValidation(DSET_TYPES[HR_DSET])], allow_empty=False) + +# Position validators +POS_VALIDATORS = {} +POS_VALIDATORS[HR_DSET] = Column(HR_DSET, [CanConvertValidation(DSET_TYPES[HR_DSET]), null_validation], allow_empty=True) +POS_VALIDATORS[HM_SOURCE_DSET] = Column(HM_SOURCE_DSET, [CanConvertValidation(DSET_TYPES[HM_SOURCE_DSET]), InListValidation(VALID_SOURCES), LeadingWhitespaceValidation(), TrailingWhitespaceValidation(), null_validation], 
allow_empty=False) +POS_VALIDATORS[HM_SNP_DSET] = Column(HM_SNP_DSET, [CanConvertValidation(DSET_TYPES[HM_SNP_DSET]), MatchesPatternValidation(r'^(rs|HLA\-\w+\*)[0-9]+$')], allow_empty=True) +POS_VALIDATORS[HM_CHR_DSET] = Column(HM_CHR_DSET, [InListValidation(VALID_CHROMOSOMES)], allow_empty=True) +POS_VALIDATORS[HM_BP_DSET] = Column(HM_BP_DSET, [CanConvertValidation(DSET_TYPES[HM_BP_DSET]), InInclusiveRangeValidation(1, 999999999)], allow_empty=True) +POS_VALIDATORS[HM_OTH_DSET] = Column(HM_OTH_DSET, [MatchesPatternValidation(r'^[ACTGN\-\/]+$')], allow_empty=True) +POS_VALIDATORS[HM_MATCH_CHR_DSET] = Column(HM_MATCH_CHR_DSET, [InListValidation(['True', 'False'])], allow_empty=True) +POS_VALIDATORS[HM_MATCH_BP_DSET] = Column(HM_MATCH_BP_DSET, [InListValidation(['True', 'False'])], allow_empty=True) + +# Final validator +# FINAL_VALIDATORS = {k:v for k,v in GENERIC_VALIDATORS.items()} +# FINAL_VALIDATORS[EFFECT_DSET] = Column(EFFECT_DSET, [MatchesPatternValidation(r'^[ACTGN\-]+$')], allow_empty=True) +# FINAL_VALIDATORS[OTH_DSET] = Column(OTH_DSET, [MatchesPatternValidation(r'^[ACTGN\-\.]+$')], allow_empty=True) +# FINAL_VALIDATORS[VARIANT_DSET] = Column(VARIANT_DSET, [CanConvertValidation(DSET_TYPES[VARIANT_DSET]), MatchesPatternValidation(r'^((rs|HLA\-\w+\*)[0-9]+|\.)$')], allow_empty=True) +# FINAL_VALIDATORS[HM_CODE_DSET] = Column(HM_CODE_DSET, [InListValidation(VALID_CODES), null_validation], allow_empty=True) +# FINAL_VALIDATORS[HM_INFO_DSET] = Column(HM_INFO_DSET, [CanConvertValidation(DSET_TYPES[HM_INFO_DSET]), null_validation], allow_empty=True) + + +#### Metadata entries #### + +FORMATTED_META_GENERIC = [ + '###PGS CATALOG SCORING FILE', + '#format_version', + '##POLYGENIC SCORE', + '#pgs_id', + '#pgs_name', + '#trait_reported', + '#trait_mapped', + '#trait_efo', + '#genome_build', + '#variants_number', + '#weight_type', + '##SOURCE INFORMATION', + '#pgp_id', + '#citation' +] + +HM_META_GENERIC = [ x for x in FORMATTED_META_GENERIC ] +HM_META_GENERIC.append('##HARMONIZATION DETAILS') + +HM_META_POS = [ x for x in HM_META_GENERIC ] +HM_META_POS.append('#HmPOS_build') +HM_META_POS.append('#HmPOS_date') +HM_META_POS.append('#HmPOS_match_chr') +HM_META_POS.append('#HmPOS_match_pos') + +# HM_META_FINAL = [ x for x in HM_META_GENERIC ] +# HM_META_FINAL.append('#Hm_file_version') +# HM_META_FINAL.append('#Hm_genome_build') +# HM_META_FINAL.append('#Hm_reference_source') +# HM_META_FINAL.append('#Hm_creation_date') +# HM_META_FINAL.append('#Hm_variants_number_matched') +# HM_META_FINAL.append('#Hm_variants_number_unmapped') \ No newline at end of file diff --git a/pgscatalog.validate/src/pgscatalog/validate/lib/validate_scorefile.py b/pgscatalog.validate/src/pgscatalog/validate/lib/validate_scorefile.py new file mode 100644 index 0000000..80294c3 --- /dev/null +++ b/pgscatalog.validate/src/pgscatalog/validate/lib/validate_scorefile.py @@ -0,0 +1,171 @@ +import os, glob, re +import argparse +import logging +import textwrap + +data_sum = {'valid': [], 'invalid': [], 'other': []} + +val_types = ('formatted', 'hm_pos') + +logging.basicConfig(level=logging.INFO, format='(%(levelname)s): %(message)s') + + +def validate_scorefile() -> None: + global data_sum, score_dir + args = _parse_args() + _check_args(args) + + # Check PGS Catalog file name nomenclature + check_filename = False + if args.check_filename: + check_filename = True + else: + print("WARNING: the parameter '--check_filename' is not present in the submitted command line, therefore the validation of the scoring file name(s) won't be 
performed.") + + validator_type = args.t + files_dir = args.dir + log_dir = args.log_dir + + ## Select validator class ## + if validator_type == 'formatted': + import pgscatalog_utils.validate.formatted.validator as validator_package + elif validator_type == 'hm_pos': + import pgscatalog_utils.validate.harmonized_position.validator as validator_package + + ## Run validator ## + # One file + if args.f: + _run_validator(args.f,log_dir,score_dir,validator_package,check_filename,validator_type) + # Content of the directory + elif files_dir: + count_files = 0 + # Browse directory: for each file run validator + for filepath in sorted(glob.glob(files_dir+"/*.*")): + _run_validator(filepath,log_dir,score_dir,validator_package,check_filename,validator_type) + count_files += 1 + + # Print summary + results + print("\nSummary:") + if data_sum['valid']: + print(f"- Valid: {len(data_sum['valid'])}/{count_files}") + if data_sum['invalid']: + print(f"- Invalid: {len(data_sum['invalid'])}/{count_files}") + if data_sum['other']: + print(f"- Other issues: {len(data_sum['other'])}/{count_files}") + + if data_sum['invalid']: + print("Invalid files:") + print("\n".join(data_sum['invalid'])) + + +def _read_last_line(file: str) -> str: + ''' + Return the last line of the file + ''' + fileHandle = open ( file,"r" ) + lineList = fileHandle.readlines() + fileHandle.close() + return lineList[-1] + + +def _file_validation_state(filename: str, log_file: str) -> None: + global data_sum + if os.path.exists(log_file): + log_result = _read_last_line(log_file) + if re.search("File is valid", log_result): + print("> valid\n") + data_sum['valid'].append(filename) + elif re.search("File is invalid", log_result): + print("#### invalid! ####\n") + data_sum['invalid'].append(filename) + else:# + print("!! validation process had an issue. Please look at the logs.\n") + data_sum['other'].append(filename) + else: + print("!! validation process had an issue: the log file can't be found") + data_sum['other'].append(filename) + + +def _check_args(args: argparse.Namespace) -> None: + global score_dir + + ## Check parameters ## + # Type of validator + if args.t not in val_types: + print(f"Error: Validator type (option -t) '{args.t}' is not in the list of recognized types: {val_types}.") + exit(1) + # Logs dir + if not os.path.isdir(args.log_dir): + print(f"Error: Log dir '{args.log_dir}' can't be found!") + exit(1) + # File and directory parameters (only one of the '-f' and '--dir' can be used) + if args.f and args.dir: + print("Error: you can't use both options [-f] - single scoring file and [--dir] - directory of scoring files. 
def _check_args(args: argparse.Namespace) -> None:
+    global score_dir
+
+    ## Check parameters ##
+    # Type of validator
+    if args.t not in val_types:
+        print(f"Error: Validator type (option -t) '{args.t}' is not in the list of recognized types: {val_types}.")
+        exit(1)
+    # Logs dir
+    if not os.path.isdir(args.log_dir):
+        print(f"Error: Log dir '{args.log_dir}' can't be found!")
+        exit(1)
+    # File and directory parameters (only one of '-f' and '--dir' can be used)
+    if args.f and args.dir:
+        print("Error: you can't use both options [-f] - a single scoring file - and [--dir] - a directory of scoring files. Please use only one of these two options!")
+        exit(1)
+    elif not args.f and not args.dir:
+        print("Error: you need to provide a scoring file [-f] or a directory of scoring files [--dir]!")
+        exit(1)
+    elif args.f and not os.path.isfile(args.f):
+        print(f"Error: Scoring file '{args.f}' can't be found!")
+        exit(1)
+    elif args.dir and not os.path.isdir(args.dir):
+        print(f"Error: the scoring file directory '{args.dir}' can't be found!")
+        exit(1)
+    # Scoring files directory (only to compare with the harmonized files)
+    score_dir = None
+    if args.score_dir:
+        score_dir = args.score_dir
+        if not os.path.isdir(score_dir):
+            print(f"Error: Scoring file directory '{score_dir}' can't be found!")
+            exit(1)
+    elif args.t != 'formatted':
+        print("WARNING: the parameter '--score_dir' is not present in the submitted command line, therefore the comparison of the number of data rows between the formatted scoring file(s) and the harmonized scoring file(s) won't be performed.")
+
+
+def _run_validator(filepath: str, log_dir: str, score_dir: str, validator_package: object, check_filename: bool, validator_type: str) -> None:
+    ''' Run the file validator '''
+    file = os.path.basename(filepath)
+    filename = file.split('.')[0]
+    print(f"# Filename: {file}")
+    log_file = f'{log_dir}/{filename}_log.txt'
+
+    # Run validator
+    validator = validator_package.init_validator(filepath,log_file,score_dir)
+    if check_filename:
+        validator.run_validator()
+    else:
+        validator.run_validator_skip_check_filename()
+
+    # Check log
+    _file_validation_state(file,log_file)
+
+
+def _description_text() -> str:
+    return textwrap.dedent('''\
+    Validate a set of scoring files to match the PGS Catalog scoring file formats.
+    It can validate:
+    - The formatted scoring file format (https://www.pgscatalog.org/downloads/#dl_ftp_scoring)
+    - The harmonized (Position) scoring file format (https://www.pgscatalog.org/downloads/#dl_ftp_scoring_hm_pos)
+    ''')
+
+
+def _epilog_text() -> str:
+    return textwrap.dedent(f'''\
+    You need to specify the type of file format to validate, using the parameter '-t' ({' or '.join(val_types)}).
+    ''')
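Assuming the package is installed, the module can be exercised end-to-end as sketched below (all paths are hypothetical; `_parse_args()` falls back to `sys.argv` when called without arguments):

```python
# Hypothetical invocation sketch: drive the CLI programmatically by faking argv.
import sys
from pgscatalog.validate.lib.validate_scorefile import validate_scorefile

sys.argv = ['validate_scorefile', '-t', 'hm_pos',
            '--dir', '/path/to/harmonized_files',      # hypothetical paths
            '--score_dir', '/path/to/formatted_files',
            '--log_dir', '/path/to/logs',
            '--check_filename']
validate_scorefile()
```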
+
+
+def _parse_args(args=None) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(description=_description_text(), epilog=_epilog_text(),
+                                     formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("-t", help=f"Type of validator: {' or '.join(val_types)}", metavar='VALIDATOR_TYPE')
+    parser.add_argument("-f", help='The path to the polygenic scoring file to be validated (no need to use the [--dir] option)', metavar='SCORING_FILE_NAME')
+    parser.add_argument('--dir', help='The name of the directory containing the files that need to be processed (no need to use the [-f] option)')
+    parser.add_argument('--score_dir', help='The name of the directory containing the formatted scoring files to compare with harmonized scoring files')
+    parser.add_argument('--log_dir', help='The name of the log directory where the log file(s) will be stored', required=True)
+    parser.add_argument('--check_filename', help='Check that the file name matches the PGS Catalog nomenclature', required=False, action='store_true')
+    return parser.parse_args(args)
+
+
+if __name__ == '__main__':
+    validate_scorefile()
diff --git a/pgscatalog.validate/src/pgscatalog/validate/lib/validator_base.py b/pgscatalog.validate/src/pgscatalog/validate/lib/validator_base.py
new file mode 100644
index 0000000..6d9173d
--- /dev/null
+++ b/pgscatalog.validate/src/pgscatalog/validate/lib/validator_base.py
@@ -0,0 +1,429 @@
+import os, sys, gc
+import gzip
+import csv
+import pathlib
+import logging
+import re
+from typing import List
+import pandas as pd
+import pandas_schema
+import warnings
+from .schemas import *
+
+'''
+PGS Catalog file validator
+- using pandas_schema https://github.com/TMiguelT/PandasSchema
+'''
+
+
+csv.field_size_limit(sys.maxsize)
+
+class ValidatorBase:
+
+    valid_extensions = VALID_FILE_EXTENSIONS
+    schema_validators = GENERIC_VALIDATORS
+    valid_cols = []
+    valid_type = ''
+    sep = '\t'
+
+    def __init__(self, file, score_dir=None, logfile="VALIDATE.log", error_limit=0):
+        self.file = file
+        self.score_dir = score_dir
+        self.schema = None
+        self.header = []
+        self.genomebuild = None
+        self.comment_lines_count = 1 # Counting the header line
+        self.cols_to_validate = []
+        self.cols_to_read = []
+        self.bad_rows = []
+        self.row_errors = []
+        self.errors_seen = {}
+        self.logfile = logfile
+        self.error_limit = int(error_limit)
+        self.is_valid = True
+
+        # Logging variables
+        self.logger = logging.getLogger(__name__)
+        self.handler = logging.FileHandler(self.logfile, 'w+')
+        self.handler.setLevel(logging.INFO)
+        self.logger.addHandler(self.handler)
+        self.logger.propagate = False
+
+        self.global_errors = 0
+        self.variants_number = 0
+
+
+    def validate_schema(self, schema: dict, dataframe_to_validate: pd.core.frame.DataFrame):
+        '''
+        Run the pandas_schema validation using the provided Schema and DataFrame
+        '''
+        self.schema = pandas_schema.Schema([schema[h] for h in self.cols_to_validate])
+        with warnings.catch_warnings():
+            # Ignore Python warnings raised by the pandas_schema code
+            warnings.simplefilter('ignore', UserWarning)
+            errors = self.schema.validate(dataframe_to_validate)
+        self.store_errors(errors)
+
+
+    def setup_field_validation(self):
+        '''
+        Fetch the header and build the lists of columns to read and validate
+        '''
+        self.header = self.get_header()
+        self.cols_to_validate = [h for h in self.header if h in self.valid_cols]
+        self.cols_to_read = [h for h in self.header if h in self.valid_cols]
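To make the moving parts concrete, here is a minimal, self-contained sketch of how a `Column`, the custom `InInclusiveRangeValidation` and `Schema.validate()` interact (the column name `chr_position` and the use of plain `int` instead of `DSET_TYPES` are illustrative only):

```python
# Minimal pandas_schema sketch mirroring validate_schema() above.
import pandas as pd
from pandas_schema import Column, Schema
from pandas_schema.validation import CanConvertValidation

from pgscatalog.validate.lib.helpers import InInclusiveRangeValidation

df = pd.DataFrame({'chr_position': ['12345', '0', 'not_a_number']})
schema = Schema([
    Column('chr_position', [CanConvertValidation(int),
                            InInclusiveRangeValidation(1, 999999999)])
])
for warning in schema.validate(df):
    # ValidationWarning objects expose .row, .column and .message,
    # which store_errors() below relies on.
    print(warning.row, warning.column, warning.message)
```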
+    def get_header(self):
+        '''
+        Fetch the header (i.e. the column names) from the scoring file and store the list in a variable
+        '''
+        first_row = pd.read_csv(self.file, sep=self.sep, comment='#', nrows=1, index_col=False)
+        # Check if the column headers have leading and/or trailing spaces
+        # The leading/trailing spaces should raise an error during the header validation
+        has_trailing_spaces = self.check_leading_trailing_spaces(first_row.columns.values)
+        if has_trailing_spaces:
+            self.global_errors += 1
+        return first_row.columns.values
+
+
+    def get_genomebuild(self):
+        ''' Retrieve the Genome Build from the comments '''
+        if self.valid_type == 'hm_pos':
+            self.genomebuild = self.get_comments_info('#HmPOS_build')
+        else:
+            self.genomebuild = self.get_comments_info('#Hm_genome_build')
+
+
+    def get_pgs_id(self):
+        ''' Retrieve the PGS ID from the comments '''
+        self.pgs_id = self.get_comments_info('#pgs_id')
+
+
+    def validate_content(self):
+        ''' Validate the data lines and check that all the expected metadata lines are present '''
+        variant_lines_count = 0
+        meta_lines_count = 0
+
+        with gzip.open(self.file, 'rb') as f:
+            line_number = 0
+            file_meta = []
+            for line in f:
+                line_number += 1
+                line = line.decode('utf-8').rstrip()
+                # Check metadata
+                if line.startswith('#'):
+                    self.extract_specific_metadata(line)
+                    # Check that we have all the meta information
+                    for meta in self.meta_format:
+                        if line.startswith(meta):
+                            file_meta.append(meta)
+                            meta_lines_count += 1
+                            break
+
+                # Check data
+                else:
+                    variant_lines_count += 1
+                    if re.search(r'\w+', line): # Line not empty
+                        cols_content = line.split(self.sep)
+                        has_trailing_spaces = self.check_leading_trailing_spaces(cols_content,line_number)
+                        if has_trailing_spaces:
+                            self.global_errors += 1
+
+                        if line.startswith('rsID') or line.startswith('chr_name'):
+                            continue
+
+                        self.validate_line_content(cols_content,variant_lines_count)
+                    else:
+                        self.logger.error(f'- Line {line_number} is empty')
+                        self.global_errors += 1
+
+        # Compare the number of metadata lines: read vs expected
+        if meta_lines_count != len(self.meta_format):
+            self.logger.error(f'- The number of metadata lines [i.e. starting with the "#" character] in the file ({meta_lines_count}) and the expected number of metadata lines ({len(self.meta_format)}) are different')
+            diff_list = list(set(self.meta_format).difference(file_meta))
+            self.logger.error(f" > Missing metadata line(s): {', '.join(diff_list)}")
+            self.global_errors += 1
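For context, `validate_content()` expects one metadata line per entry of the relevant `meta_format` list (`FORMATTED_META_GENERIC`, or `HM_META_POS` for harmonized files, from `schemas.py`; the attribute itself is assigned by the concrete validators). A sketch of the comment block it scans, with made-up values after each known prefix, looks like:

```
###PGS CATALOG SCORING FILE
#format_version=2.0
##POLYGENIC SCORE
#pgs_id=PGS000001
#pgs_name=EXAMPLE_PGS
#trait_reported=Example trait
#trait_mapped=example trait
#trait_efo=EFO_0000000
#genome_build=GRCh38
#variants_number=77
#weight_type=beta
##SOURCE INFORMATION
#pgp_id=PGP000001
#citation=Author A et al. Journal (2020)
```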
+    def validate_data(self) -> bool:
+        ''' Validate the file: data format and data content '''
+        self.logger.info("Validating data...")
+        if not self.open_file_and_check_for_squareness():
+            self.logger.error("Please fix the table: some rows have a different number of columns from the header")
+            self.logger.info("Rows with a different number of columns from the header are not validated")
+
+        # Validate the data content and check the consistency between the declared number of variants and the actual number of variants in the file
+        self.validate_content()
+        for chunk in self.df_iterator(self.file):
+            dataframe_to_validate = chunk[self.cols_to_read]
+            dataframe_to_validate.columns = self.cols_to_validate # sets the headers to the standard format if needed
+
+            # Schema validation
+            self.validate_schema(self.schema_validators,dataframe_to_validate)
+
+            self.process_errors()
+            if self.error_limit > 0 and len(self.bad_rows) >= self.error_limit:
+                break
+
+        if not self.bad_rows and not self.global_errors and self.is_valid:
+            self.logger.info("File is valid")
+        else:
+            self.logger.info("File is invalid - {} bad rows, limit set to {}".format(len(self.bad_rows), self.error_limit))
+            self.set_file_is_invalid()
+        return self.is_valid
+
+
+    def is_file_valid(self) -> bool:
+        ''' Method returning the boolean value: True if the file is valid, False if the file is invalid. '''
+        return self.is_valid
+
+    def set_file_is_invalid(self):
+        ''' Set the flag "is_valid" to False. '''
+        self.is_valid = False
+
+
+    def process_errors(self):
+        ''' Log the stored errors and populate the list of bad rows. '''
+        for error in self.row_errors:
+            if len(self.bad_rows) < self.error_limit or self.error_limit < 1:
+                self.logger.error(error)
+            if error.row not in self.bad_rows:
+                self.bad_rows.append(error.row)
+        self.row_errors = []
+
+
+    def store_errors(self, errors: List[pandas_schema.validation_warning.ValidationWarning]):
+        ''' Capture the errors found in a temporary structure before they are processed. '''
+        for error in errors:
+            seen = 0
+            row_number = error.row
+            file_line_number = row_number + self.comment_lines_count + 1 # rows are 0-indexed; +1 for the header line
+            error.row = str(row_number) + " (line "+str(file_line_number)+")"
+            col = error.column
+            # Avoid duplication as the errors can be detected several times
+            if row_number in self.errors_seen.keys():
+                if col in self.errors_seen[row_number].keys():
+                    seen = 1
+                else:
+                    self.errors_seen[row_number][col] = 1
+            else:
+                self.errors_seen[row_number] = { col : 1 }
+            if seen == 0:
+                self.row_errors.append(error)
+
+
+    def validate_file_extension(self):
+        ''' Check/validate the file name extension. '''
+        self.logger.info("Validating file extension...")
+        check_exts = [self.check_ext(ext) for ext in self.valid_extensions]
+        if not any(check_exts):
+            self.valid_ext = False
+            self.set_file_is_invalid()
+            self.logger.info("Invalid file extension: {}".format(self.file))
+            self.logger.error("File extension should be in {}".format(self.valid_extensions))
+        else:
+            self.valid_ext = True
+        return self.valid_ext
+    def compare_number_of_rows(self):
+        ''' Compare the number of data rows between the harmonized and the formatted scoring files. '''
+        # Harmonization file - length
+        hm_rows_count = 0
+        for chunk in self.df_iterator(self.file):
+            hm_rows_count += len(chunk.index)
+            gc.collect()
+
+        # Formatted scoring file - length
+        scoring_rows_count = 0
+        scoring_file = f'{self.score_dir}/{self.pgs_id}.txt.gz'
+        if os.path.isfile(scoring_file):
+            for score_chunk in self.df_iterator(scoring_file):
+                scoring_rows_count += len(score_chunk.index)
+                gc.collect()
+
+        comparison_status = True
+        if scoring_rows_count == 0:
+            self.logger.error(f"Can't find the Scoring file '{scoring_file}' to compare the number of rows with the harmonization file!")
+            comparison_status = False
+        elif hm_rows_count != scoring_rows_count:
+            self.logger.error(f'The numbers of data rows in the Scoring file ({scoring_rows_count}) and the Harmonization POS file ({hm_rows_count}) are different')
+            comparison_status = False
+        return comparison_status
+
+
+    def compare_with_filename(self):
+        ''' Check that the filename matches the information present in the file metadata (PGS ID, genome build). '''
+        self.logger.info("Comparing filename with metadata...")
+        comparison_status = True
+        if hasattr(self,'file_genomebuild') and hasattr(self,'file_pgs_id'):
+            # Extract some metadata
+            self.get_genomebuild()
+            self.get_pgs_id()
+            # Compare metadata with filename information
+            if self.file_genomebuild != self.genomebuild:
+                self.logger.error("Build: the genome build in the HmPOS_build header ({}) is different from the one in the filename ({})".format(self.genomebuild,self.file_genomebuild))
+                comparison_status = False
+            if self.file_pgs_id != self.pgs_id:
+                self.logger.error("ID: the PGS ID in the header ({}) is different from the one in the filename ({})".format(self.pgs_id,self.file_pgs_id))
+                comparison_status = False
+            # Compare number of rows with Scoring file
+            if self.score_dir:
+                row_comparison_status = self.compare_number_of_rows()
+                if not row_comparison_status:
+                    comparison_status = False
+            else:
+                self.logger.info("Comparison of the number of rows between Harmonized and Scoring file skipped!")
+        if not comparison_status:
+            self.logger.info("Discrepancies between filename information and metadata: {}".format(self.file))
+            self.set_file_is_invalid()
+        return comparison_status
+
+
+    def df_iterator(self, data_file: str):
+        ''' Set up a pandas DataFrame iterator. '''
+        df = pd.read_csv(data_file,
+                         sep=self.sep,
+                         dtype=str,
+                         comment='#',
+                         chunksize=1000000)
+        return df
+
+
+    def check_file_is_square(self, csv_file):
+        ''' Check that each row has the same number of columns as the header. '''
+        square = True
+        csv_file.seek(0)
+        reader = csv.reader(csv_file, delimiter=self.sep)
+        count = 1
+        for row in reader:
+            if len(row) != 0:
+                if row[0].startswith('#'):
+                    self.comment_lines_count += 1
+                    continue
+                if (len(row) != len(self.header)):
+                    self.logger.error("Length of row {c} is: {l} instead of {h}".format(c=count, l=str(len(row)), h=str(len(self.header))))
+                    self.logger.error("ROW: "+str(row))
+                    square = False
+            count += 1
+        del csv_file
+        return square
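The same chunked-reading pattern backs both `compare_number_of_rows()` and `validate_data()`: with `chunksize` set, pandas returns an iterator of DataFrames, so a scoring file is never loaded into memory whole. A minimal standalone sketch (the file name is hypothetical):

```python
# Chunked row count, mirroring df_iterator() above.
import pandas as pd

rows = 0
for chunk in pd.read_csv('PGS000001.txt.gz',  # hypothetical file name
                         sep='\t', dtype=str, comment='#', chunksize=1000000):
    rows += len(chunk.index)
print(f'{rows} data rows')
```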
+    def open_file_and_check_for_squareness(self):
+        ''' Read the file and check that each row has the same number of columns as the header. '''
+        if pathlib.Path(self.file).suffix in [".gz", ".gzip"]:
+            with gzip.open(self.file, 'rt') as f:
+                return self.check_file_is_square(f)
+        else:
+            with open(self.file) as f:
+                return self.check_file_is_square(f)
+
+
+    def check_leading_trailing_spaces(self, cols: list, line_number: int = None):
+        '''
+        Check if the columns have leading and/or trailing spaces.
+        The leading/trailing spaces should raise an error during the validation.
+        '''
+        leading_trailing_spaces = []
+        found_trailing_spaces = False
+        for idx, col in enumerate(cols):
+            if col.startswith(' ') or col.endswith(' '):
+                leading_trailing_spaces.append(self.header[idx]+' => |'+str(col)+'|')
+        if len(leading_trailing_spaces):
+            if line_number:
+                line_name = f'line {line_number} has'
+            else:
+                line_name = 'following headers have'
+            self.logger.error("The "+line_name+" leading and/or trailing spaces: "+' ; '.join(leading_trailing_spaces))
+            found_trailing_spaces = True
+        return found_trailing_spaces
+
+
+    def check_ext(self, ext: str) -> bool:
+        if self.file.endswith(ext):
+            return True
+        return False
+
+
+    def check_build_is_legit(self, build: str) -> bool:
+        if build in BUILD_LIST:
+            return True
+        return False
+
+
+    def get_comments_info(self, type: str) -> str:
+        ''' Retrieve information from the comment lines '''
+        with gzip.open(self.file, 'rb') as f_in:
+            for f_line in f_in:
+                line = f_line.decode()
+                # Return the value of the first comment line matching the given prefix
+                if line.startswith(type):
+                    info = (line.split('='))[1]
+                    return info.strip()
+
+    def run_generic_validator(self,check_filename):
+        self.logger.propagate = False
+
+        # Check files exist
+        if not self.file or not self.logfile:
+            self.logger.info("Missing file and/or logfile")
+            self.set_file_is_invalid()
+        elif self.file and not os.path.exists(self.file):
+            self.logger.info("Error: the file '"+self.file+"' can't be found")
+            self.set_file_is_invalid()
+
+        # Validate file extension
+        self.validate_file_extension()
+
+        # Validate file name nomenclature
+        if self.is_file_valid() and check_filename:
+            self.validate_filename()
+
+        # Only for harmonized files
+        if self.is_file_valid() and type(self).__name__ != 'ValidatorFormatted':
+            self.compare_with_filename()
+
+        # Validate column headers
+        if self.is_file_valid():
+            self.validate_headers()
+
+        # Validate data content
+        if self.is_file_valid():
+            self.validate_data()
+
+        # Close log handler
+        self.logger.removeHandler(self.handler)
+        self.handler.close()
+
+    def run_validator(self):
+        self.run_generic_validator(True)
+
+    def run_validator_skip_check_filename(self):
+        self.run_generic_validator(False)
+
+
+    def validate_filename(self):
+        ''' Validate the file name structure. '''
+        print("To be implemented in inherited classes")
+        pass
+
+
+    def validate_headers(self):
+        ''' Validate the list of column names. '''
+        print("To be implemented in inherited classes")
+        pass
+
+
+    def validate_line_content(self, cols_content, var_line_number: int):
+        ''' Validate each data row. '''
+        print("To be implemented in inherited classes")
+        pass
+
+
+    def extract_specific_metadata(self, line: str):
+        ''' Extra method to extract and validate specific data. '''
+        print("To be implemented in inherited classes")
+        pass
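For orientation, the four stub methods plus the class-level schema attributes are the whole extension surface of `ValidatorBase`. A skeletal subclass sketch follows; the schema names come from `schemas.py`, the filename rule is a made-up placeholder (the real one lives in `formatted/validator.py`), and note that `run_generic_validator()` skips `compare_with_filename()` based on the class being named exactly `ValidatorFormatted`:

```python
# Skeletal subclass sketch of ValidatorBase (details elided, not the shipped validator).
import os
import re

from pgscatalog.validate.lib.validator_base import ValidatorBase
from pgscatalog.validate.lib.schemas import (FORMATTED_VALIDATORS,
                                             VALID_COLS_FORMATTED,
                                             VALID_TYPE_FORMATTED,
                                             FORMATTED_META_GENERIC)


class ValidatorFormatted(ValidatorBase):
    # Class-level attributes read by ValidatorBase.
    schema_validators = FORMATTED_VALIDATORS
    valid_cols = VALID_COLS_FORMATTED
    valid_type = VALID_TYPE_FORMATTED
    meta_format = FORMATTED_META_GENERIC  # consumed by validate_content()

    def validate_filename(self):
        # Placeholder naming rule, assumed for this sketch only.
        if not re.match(r'^PGS\d{6}\.txt(\.gz)?$', os.path.basename(self.file)):
            self.logger.error("Filename does not match the expected pattern")
            self.set_file_is_invalid()

    def validate_headers(self):
        self.setup_field_validation()
        if not self.cols_to_validate:
            self.logger.error("No known column found in the file header")
            self.set_file_is_invalid()

    def validate_line_content(self, cols_content, var_line_number):
        pass  # per-row checks go here

    def extract_specific_metadata(self, line):
        pass  # e.g. capture #variants_number for a later comparison


validator = ValidatorFormatted(file='PGS000001.txt.gz')  # hypothetical input
validator.run_validator()
print(validator.is_file_valid())
```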