Skip to content

Commit

Permalink
Refactor combine_journal_lists scripts to improve quality
Browse files Browse the repository at this point in the history
  • Loading branch information
northword committed Oct 4, 2024
1 parent 10e6244 commit 3a8983a
Show file tree
Hide file tree
Showing 2 changed files with 163 additions and 51 deletions.
97 changes: 76 additions & 21 deletions scripts/combine_journal_lists_dotless.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""
Python script for combining several journal abbreviation lists
and producing an alphabetically sorted list. If the same journal
names are repeated, only the version found first is retained.
This version of the script specifically combines the lists following the ISO4
standard WITHOUT dots after abbreviated words.
Expand All @@ -13,37 +13,92 @@
Output: writes file 'journalList_dotless.csv'
"""

import csv
import json
from pathlib import Path
import re
import sys
import pandas as pd

# Define the list of CSV files
# CSV source lists, merged in order; because duplicate journal names keep the
# first occurrence, earlier files take priority over later ones.
import_order = [
    "journals/journal_abbreviations_entrez.csv",
    "journals/journal_abbreviations_medicus.csv",
    "journals/journal_abbreviations_webofscience-dotless.csv",
]


def main(output_filename):
# Read and merge CSV files
# dfs = [pd.read_csv(file, header=None) for file in import_order]
dfs = []
for file in import_order:
df = pd.read_csv(file, header=None)
dfs.append(df)
print(f"{file}: {len(df)}")
merged_df = pd.concat(dfs, ignore_index=True)
def load_data(file_paths):
    """Load and combine journal (name, abbreviation) pairs from CSV files.

    Files are read in the given order; when the same journal name (exact,
    or equal after normalize_name()) appears more than once, only the first
    occurrence is retained.

    Args:
        file_paths: iterable of CSV paths whose rows are ``name,abbreviation``.

    Returns:
        dict mapping journal name -> abbreviation.
    """
    journal_dict = {}
    normalized_keys = set()
    for path in file_paths:
        # newline="" is required by the csv module for correct handling of
        # quoted fields that contain line breaks.
        with open(path, mode="r", encoding="utf-8", newline="") as file:
            reader = csv.reader(file)
            for row in reader:
                # Skip blank/malformed rows that lack a name + abbreviation.
                if len(row) < 2:
                    continue
                name = row[0].strip()
                abbr = row[1].strip()

                # Discard entries where name or abbr is missing
                if not (name and abbr):
                    continue
                # Discard entries that are too long or too short
                if len(name) >= 80 or len(name) <= 3:
                    continue
                # Discard names that start with non-alphanumeric characters
                if not name[0].isalnum():
                    continue
                # Discard names that consist only of numbers
                if name.replace(" ", "").isnumeric():
                    continue
                # Discard names containing \
                if "\\" in name:
                    continue
                # Discard entries where the first letters of name and abbr do
                # not match, ignoring a leading article.  Guard against the
                # article stripping leaving an empty string (IndexError).
                stripped = name.replace("The", "").replace("A ", "")
                if not stripped or abbr[0] != stripped[0]:
                    continue
                # Only keep the first occurrence of an exact name
                if name in journal_dict:
                    continue
                # Generate normalizedKey, keeping only the first match
                normalized_key = normalize_name(name)
                if normalized_key in normalized_keys:
                    continue

                journal_dict[name] = abbr
                normalized_keys.add(normalized_key)  # Add to the set of used keys
    return journal_dict

# Save the result to the specified CSV file and ensure values are quoted
sorted_df.to_csv(output_filename, index=False, header=False, quoting=1)

print(f"Write {output_filename}, Combined key count: {len(merged_df)}")
def normalize_name(name):
    """Return a lower-cased key for *name*, stripped of articles and punctuation.

    Removes the words "the"/"and" plus the characters ``& - : , ( )`` and
    spaces, mirroring src/utils/str.ts -> normalizeKey().
    """
    pattern = re.compile(r"\b(the|and)\b|[&\-:, ()]", re.IGNORECASE)
    return pattern.sub("", name).lower()


def save_to_json(data, output_file):
    """Serialize *data* to *output_file* as indented UTF-8 JSON (non-ASCII kept)."""
    serialized = json.dumps(data, indent=2, ensure_ascii=False)
    with open(output_file, mode="w", encoding="utf-8") as json_file:
        json_file.write(serialized)


def save_to_csv(data, output_file):
    """Write each (name, abbreviation) pair of *data* to CSV, quoting all fields."""
    with open(output_file, mode="w", newline="", encoding="utf-8") as csv_file:
        # quoting=csv.QUOTE_ALL (== 1) forces quotes around every value.
        csv.writer(csv_file, quoting=csv.QUOTE_ALL).writerows(data.items())


def main(filename):
    """Combine every source list and write the alphabetized result to *filename*."""
    cwd = Path.cwd()
    source_paths = [cwd / rel for rel in import_order]

    combined = load_data(source_paths)
    # Sort by journal name before writing.
    save_to_csv(dict(sorted(combined.items())), cwd / filename)


if __name__ == "__main__":
Expand Down
117 changes: 87 additions & 30 deletions scripts/combine_journal_lists_dots.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""
Python script for combining several journal abbreviation lists
and producing an alphabetically sorted list. If the same journal
names are repeated, only the version found first is retained.
This version of the script specifically combines the lists following the ISO4
standard WITH dots after abbreviated words.
Expand All @@ -13,45 +13,102 @@
Output: writes file 'journalList_dots.csv' (or specified output file)
"""

import csv
import json
from pathlib import Path
import re
import sys
import pandas as pd

# Define the list of CSV files
# CSV source lists, merged in order; because duplicate journal names keep the
# first occurrence, earlier files take priority over later ones.
import_order = [
    # Keep IEEE before ubc, because IEEE has its own style.
    "journals/journal_abbreviations_ieee.csv",
    "journals/journal_abbreviations_acs.csv",
    # Keep ubc before other jabref's, because ubc's data is more accurate.
    "journals/journal_abbreviations_ubc.csv",
    "journals/journal_abbreviations_ams.csv",
    "journals/journal_abbreviations_general.csv",
    "journals/journal_abbreviations_geology_physics.csv",
    "journals/journal_abbreviations_lifescience.csv",
    "journals/journal_abbreviations_mathematics.csv",
    "journals/journal_abbreviations_mechanical.csv",
    "journals/journal_abbreviations_meteorology.csv",
    "journals/journal_abbreviations_sociology.csv",
    "journals/journal_abbreviations_webofscience-dots.csv",
]


def main(output_filename):
# Read and merge CSV files
# dfs = [pd.read_csv(file, header=None) for file in import_order]
dfs = []
for file in import_order:
df = pd.read_csv(file, header=None)
dfs.append(df)
print(f"{file}: {len(df)}")
merged_df = pd.concat(dfs, ignore_index=True)
def load_data(file_paths):
    """Load and combine journal (name, abbreviation) pairs from CSV files.

    Files are read in the given order; when the same journal name (exact,
    or equal after normalize_name()) appears more than once, only the first
    occurrence is retained.

    Args:
        file_paths: iterable of CSV paths whose rows are ``name,abbreviation``.

    Returns:
        dict mapping journal name -> abbreviation.
    """
    journal_dict = {}
    normalized_keys = set()
    for path in file_paths:
        # newline="" is required by the csv module for correct handling of
        # quoted fields that contain line breaks.
        with open(path, mode="r", encoding="utf-8", newline="") as file:
            reader = csv.reader(file)
            for row in reader:
                # Skip blank/malformed rows that lack a name + abbreviation.
                if len(row) < 2:
                    continue
                name = row[0].strip()
                abbr = row[1].strip()

                # Discard entries where name or abbr is missing
                if not (name and abbr):
                    continue
                # Discard entries that are too long or too short
                if len(name) >= 80 or len(name) <= 3:
                    continue
                # Discard names that start with non-alphanumeric characters
                if not name[0].isalnum():
                    continue
                # Discard names that consist only of numbers
                if name.replace(" ", "").isnumeric():
                    continue
                # Discard names containing \
                if "\\" in name:
                    continue
                # Discard entries where the first letters of name and abbr do
                # not match, ignoring a leading article.  Guard against the
                # article stripping leaving an empty string (IndexError).
                stripped = name.replace("The", "").replace("A ", "")
                if not stripped or abbr[0] != stripped[0]:
                    continue
                # Only keep the first occurrence of an exact name
                if name in journal_dict:
                    continue
                # Generate normalizedKey, keeping only the first match
                normalized_key = normalize_name(name)
                if normalized_key in normalized_keys:
                    continue

                journal_dict[name] = abbr
                normalized_keys.add(normalized_key)  # Add to the set of used keys
    return journal_dict

# Save the result to the specified CSV file and ensure values are quoted
sorted_df.to_csv(output_filename, index=False, header=False, quoting=1)

print(f"Write {output_filename}, Combined key count: {len(merged_df)}")
def normalize_name(name):
    """Lower-case *name* and strip articles, conjunctions, and punctuation.

    Removes the words "the"/"and" plus the characters ``& - : , ( )`` and
    spaces, mirroring src/utils/str.ts -> normalizeKey().
    """
    cleaned = re.sub(r"\b(the|and)\b|[&\-:, ()]", "", name, flags=re.IGNORECASE)
    return cleaned.lower()


def save_to_json(data, output_file):
    """Dump *data* to *output_file* as indented UTF-8 JSON (non-ASCII kept)."""
    with open(output_file, mode="w", encoding="utf-8") as handle:
        handle.write(json.dumps(data, indent=2, ensure_ascii=False))


def save_to_csv(data, output_file):
    """Write every (name, abbreviation) pair of *data* to CSV, all fields quoted."""
    with open(output_file, mode="w", newline="", encoding="utf-8") as handle:
        # quoting=csv.QUOTE_ALL (== 1) forces quotes around every value.
        writer = csv.writer(handle, quoting=csv.QUOTE_ALL)
        writer.writerows(data.items())


def main(filename):
    """Merge every source list and write the alphabetized result to *filename*."""
    cwd = Path.cwd()
    paths = [cwd / rel for rel in import_order]

    data = load_data(paths)
    # Sort by journal name before writing.
    save_to_csv(dict(sorted(data.items())), cwd / filename)


if __name__ == "__main__":
Expand Down

0 comments on commit 3a8983a

Please sign in to comment.