Skip to content

Commit

Permalink
Merge pull request #37 from BasVerlooy/feat/add-more-tables
Browse files Browse the repository at this point in the history
Add remaining Pubmed tables
  • Loading branch information
dspinellis authored Jan 16, 2024
2 parents 4a635a7 + 0b9cd6b commit e0a05bb
Show file tree
Hide file tree
Showing 32 changed files with 4,736 additions and 2,381 deletions.
2 changes: 1 addition & 1 deletion bin/update-python-api
Original file line number Diff line number Diff line change
Expand Up @@ -127,4 +127,4 @@ Generated schema
$(bin/a3k list-process-schema $process | sed 's/^/ /')
EOF
done
done
4 changes: 2 additions & 2 deletions bin/update-schema
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ set -e

for format in svg pdf ; do
# Create individual schema diagrams
for schema in crossref orcid ror uspto ; do
for schema in crossref orcid ror uspto pubmed ; do
(
cat docs/schema/schema-head.dot docs/schema/$schema.dot
bin/a3k list-source-schema $schema | bin/schema2dot.sed
Expand All @@ -27,7 +27,7 @@ for format in svg pdf ; do

# Create combined schema diagram
(
( cd docs/schema ; cat schema-head.dot crossref.dot orcid.dot ror.dot uspto.dot other.dot )
( cd docs/schema ; cat schema-head.dot crossref.dot orcid.dot ror.dot uspto.dot pubmed.dot other.dot )
( bin/a3k list-source-schema ; bin/a3k list-process-schema ) |
bin/schema2dot.sed
) | dot -T$format -o docs/schema/all.$format
Expand Down
2 changes: 2 additions & 0 deletions docs/dev.rst
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ documentation must be updated as follows.
# While in the top-level directory
bin/update-python-api
On macOS, this may require installing the
`GNU sed <https://formulae.brew.sh/formula/gnu-sed>`__ package.

Application examples documentation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
13 changes: 13 additions & 0 deletions docs/downloading.rst
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,19 @@ The shell script ``examples/common/fetch-uspto.sh`` or the
``$(USPTO_DIR)`` dependency of the ``examples/common/Makefile``,
which uses it can be used to download all USPTO data.

Pubmed data
~~~~~~~~~~~

You can populate a database with data from the Pubmed/Medline database
of the National Library of Medicine (NLM). The data is available
`here <https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/>`__.
You can also download it from the FTP server, as documented in
`this readme file <https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/README.txt>`__.
Some of the data overlaps with the Crossref dataset, but the Pubmed data
contains additional information, such as MeSH terms and grants. It also complements
the Crossref data; for example, some affiliations are available only in the Pubmed data.


Other data sources
~~~~~~~~~~~~~~~~~~

Expand Down
250 changes: 250 additions & 0 deletions docs/pubmed.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
Pubmed publication data
=======================

.. Automatically generated file. Do not modify by hand.
.. code:: py
from alexandria3k.data_sources import pubmed
.. autoclass:: data_sources.pubmed.Pubmed
:members: query, populate

Generated schema
----------------

.. code:: sql
-- pubmed_articles: 50 columns. Only the volume/issue and the
-- year/month/day date-part columns declare a type (INTEGER); every
-- other column is declared without a type.
CREATE TABLE pubmed_articles (
    id,
    container_id,
    pubmed_id,
    doi,
    publisher_item_identifier_article_id,
    pmc_article_id,
    -- journal_* columns
    journal_title,
    journal_issn,
    journal_issn_type,
    journal_cited_medium,
    journal_volume INTEGER,
    journal_issue INTEGER,
    journal_year INTEGER,
    journal_month INTEGER,
    journal_day INTEGER,
    journal_medline_date,
    journal_ISO_abbreviation,
    -- article_date_* columns
    article_date_year INTEGER,
    article_date_month INTEGER,
    article_date_day INTEGER,
    article_date_type,
    pagination,
    -- elocation_id* columns
    elocation_id,
    elocation_id_type,
    elocation_id_valid,
    language,
    title,
    vernacular_title,
    journal_country,
    medline_ta,
    nlm_unique_id,
    issn_linking,
    article_pubmodel,
    citation_subset,
    -- completed_* and revised_* date parts
    completed_year INTEGER,
    completed_month INTEGER,
    completed_day INTEGER,
    revised_year INTEGER,
    revised_month INTEGER,
    revised_day INTEGER,
    coi_statement,
    -- medline_citation_* columns
    medline_citation_status,
    medline_citation_owner,
    medline_citation_version,
    medline_citation_indexing_method,
    medline_citation_version_date,
    keyword_list_owner,
    publication_status,
    abstract_copyright_information,
    other_abstract_copyright_information
);
-- pubmed_authors: all columns are declared without a type.
-- NOTE(review): article_id presumably refers to pubmed_articles.id;
-- no foreign key is declared, so confirm against the data source.
CREATE TABLE pubmed_authors (
    id,
    container_id,
    article_id,
    given,
    family,
    suffix,
    initials,
    valid,
    identifier,
    identifier_source,
    collective_name
);
-- pubmed_author_affiliations: all columns are declared without a type.
-- NOTE(review): author_id presumably refers to pubmed_authors.id;
-- no foreign key is declared, so confirm against the data source.
CREATE TABLE pubmed_author_affiliations (
    id,
    container_id,
    author_id,
    affiliation,
    identifier
);
-- pubmed_investigators: same column list as pubmed_authors except
-- that there is no collective_name column. All columns are untyped.
-- NOTE(review): article_id presumably refers to pubmed_articles.id — confirm.
CREATE TABLE pubmed_investigators (
    id,
    container_id,
    article_id,
    given,
    family,
    suffix,
    initials,
    valid,
    identifier,
    identifier_source
);
-- pubmed_investigator_affiliations: all columns are declared without a type.
-- NOTE(review): investigator_id presumably refers to
-- pubmed_investigators.id — confirm against the data source.
CREATE TABLE pubmed_investigator_affiliations (
    id,
    container_id,
    investigator_id,
    affiliation,
    identifier
);
-- pubmed_abstracts: all columns are declared without a type.
-- NOTE(review): article_id presumably refers to pubmed_articles.id — confirm.
CREATE TABLE pubmed_abstracts (
    id,
    container_id,
    article_id,
    label,
    text,
    nlm_category,
    copyright_information
);
-- pubmed_other_abstracts: all columns are declared without a type.
-- NOTE(review): article_id presumably refers to pubmed_articles.id — confirm.
CREATE TABLE pubmed_other_abstracts (
    id,
    container_id,
    article_id,
    abstract_type,
    language
);
-- pubmed_other_abstract_texts: all columns are declared without a type.
-- NOTE(review): abstract_id presumably refers to
-- pubmed_other_abstracts.id — confirm against the data source.
CREATE TABLE pubmed_other_abstract_texts (
    id,
    container_id,
    abstract_id,
    text,
    label,
    nlm_category,
    copyright_information
);
-- pubmed_history: the five date/time part columns are INTEGER;
-- the remaining columns are declared without a type.
-- NOTE(review): article_id presumably refers to pubmed_articles.id — confirm.
CREATE TABLE pubmed_history (
    id,
    container_id,
    article_id,
    publication_status,
    -- date and time parts
    year INTEGER,
    month INTEGER,
    day INTEGER,
    hour INTEGER,
    minute INTEGER
);
-- pubmed_chemicals: all columns are declared without a type.
-- NOTE(review): article_id presumably refers to pubmed_articles.id — confirm.
CREATE TABLE pubmed_chemicals (
    id,
    container_id,
    article_id,
    registry_number,
    name_of_substance,
    unique_identifier
);
-- pubmed_meshs: descriptor_* and qualifier_* columns sit side by side
-- in the same table. All columns are declared without a type.
-- NOTE(review): article_id presumably refers to pubmed_articles.id — confirm.
CREATE TABLE pubmed_meshs (
    id,
    container_id,
    article_id,
    -- descriptor_* columns
    descriptor_name,
    descriptor_unique_identifier,
    descriptor_major_topic,
    descriptor_type,
    -- qualifier_* columns
    qualifier_name,
    qualifier_major_topic,
    qualifier_unique_identifier
);
-- pubmed_supplement_meshs: all columns are declared without a type.
-- NOTE(review): article_id presumably refers to pubmed_articles.id — confirm.
CREATE TABLE pubmed_supplement_meshs (
    id,
    container_id,
    article_id,
    supplement_mesh_name,
    unique_identifier,
    mesh_type
);
-- pubmed_comments_corrections: all columns are declared without a type.
-- NOTE(review): article_id presumably refers to pubmed_articles.id, while
-- pmid / pmid_version look like they identify the other article involved
-- — confirm against the data source.
CREATE TABLE pubmed_comments_corrections (
    id,
    container_id,
    article_id,
    ref_type,
    ref_source,
    pmid,
    pmid_version,
    note
);
-- pubmed_keywords: all columns are declared without a type.
-- NOTE(review): article_id presumably refers to pubmed_articles.id — confirm.
CREATE TABLE pubmed_keywords (
    id,
    container_id,
    article_id,
    keyword,
    major_topic
);
-- pubmed_grants: all columns are declared without a type.
-- NOTE(review): article_id presumably refers to pubmed_articles.id — confirm.
CREATE TABLE pubmed_grants (
    id,
    container_id,
    article_id,
    grant_id,
    acronym,
    agency,
    country
);
-- pubmed_data_banks: all columns are declared without a type.
-- NOTE(review): article_id presumably refers to pubmed_articles.id — confirm.
CREATE TABLE pubmed_data_banks (
    id,
    container_id,
    article_id,
    data_bank_name
);
-- pubmed_data_bank_accessions: all columns are declared without a type.
-- NOTE(review): data_bank_id presumably refers to pubmed_data_banks.id
-- — confirm against the data source.
CREATE TABLE pubmed_data_bank_accessions (
    id,
    container_id,
    data_bank_id,
    accession_number
);
-- pubmed_references: all columns are declared without a type.
-- NOTE(review): article_id presumably refers to pubmed_articles.id — confirm.
CREATE TABLE pubmed_references (
    id,
    container_id,
    article_id,
    citation
);
-- pubmed_reference_articles: all columns are declared without a type.
-- NOTE(review): reference_id presumably refers to pubmed_references.id.
-- article_id is paired with id_type here, so it may hold an external
-- identifier of the referenced article rather than a pubmed_articles.id
-- — confirm against the data source.
CREATE TABLE pubmed_reference_articles (
    id,
    container_id,
    reference_id,
    article_id,
    id_type
);
-- pubmed_publication_types: all columns are declared without a type.
-- NOTE(review): article_id presumably refers to pubmed_articles.id — confirm.
CREATE TABLE pubmed_publication_types (
    id,
    container_id,
    article_id,
    publication_type,
    unique_identifier
);
18 changes: 7 additions & 11 deletions docs/ror.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,13 @@ Generated schema
name,
status,
established,
country_code
grid,
address_city,
address_state,
address_postcode,
address_country_code,
address_lat,
address_lng
);
CREATE TABLE ror_types(
Expand Down Expand Up @@ -55,16 +61,6 @@ Generated schema
ror_path
);
CREATE TABLE ror_addresses(
id,
ror_id,
lat,
lng,
city,
state,
postcode
);
CREATE TABLE ror_funder_ids(
id,
ror_id,
Expand Down
1 change: 1 addition & 0 deletions docs/schema/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ orcid.pdf
other.pdf
ror.pdf
uspto.pdf
pubmed.pdf
Loading

0 comments on commit e0a05bb

Please sign in to comment.