"""Merge updated bib entries into the main ``diag.bib`` file.

Provenance: this code was added as the notebook ``scripts/MergeChanges.ipynb``
in commit 552bc5d ("Added code to merge bibfiles"). The notebook cells are
reproduced here as plain, importable Python.

What a run does:

1. read the current bib file and the updated bib file;
2. for every entry present in both, copy the ``all_ss_ids`` and ``pmid``
   fields from the updated entry onto the current one;
3. append every entry that only exists in the updated file;
4. save the merged result to a *new* file (the input files are not
   overwritten).

Entries are expected to expose ``.key``, ``.type`` and a dict-like
``.fields``; entries whose ``.type`` is ``'string'`` are left untouched.
"""

import os

import pandas as pd

# Default locations, relative to the directory the code is run from.
PATH_DIAG_BIB = os.path.join('..', 'diag.bib')
PATH_NEW_BIB = os.path.join('script_data/', 'diag_ss_new.bib')
PATH_MERGED_BIB = os.path.join('..', 'diag_latest_try.bib')

# Entry type that is skipped everywhere a bib key is compared.
_STRING_TYPE = 'string'

# Fields copied from the updated file onto entries that already exist.
# NOTE(review): 'all_ss_ids' presumably holds Semantic Scholar ids - confirm.
_MERGED_FIELDS = ('all_ss_ids', 'pmid')

# (summary column, bib field) pairs; the order defines the column order.
_FIELD_COLUMNS = (
    ('title', 'title'),
    ('authors', 'author'),
    ('doi', 'doi'),
    ('gs_citations', 'gscites'),
    ('journal', 'journal'),
    ('year', 'year'),
    ('all_ss_ids', 'all_ss_ids'),
    ('pmid', 'pmid'),
)


def from_bib_to_csv(diag_bib_raw):
    """Summarise bib entries as a ``pandas.DataFrame``, one row per entry.

    Despite the name, nothing is written to disk; the DataFrame is returned.

    Args:
        diag_bib_raw: iterable of entries exposing ``.key``, ``.type`` and a
            dict-like ``.fields``.

    Returns:
        DataFrame with the columns ``bibkey``, ``type``, ``title``,
        ``authors``, ``doi``, ``gs_citations``, ``journal``, ``year``,
        ``all_ss_ids`` and ``pmid``. Entries of type ``'string'`` are
        skipped; a missing field becomes an empty string.
    """
    columns = ['bibkey', 'type'] + [column for column, _ in _FIELD_COLUMNS]
    rows = []
    for bib_entry in diag_bib_raw:
        if bib_entry.type == _STRING_TYPE:
            continue
        fields = bib_entry.fields
        row = [bib_entry.key, bib_entry.type]
        # strip('{}') removes *every* leading/trailing brace character, not
        # just one enclosing pair; kept as-is because this table is only a
        # human-readable summary.
        row.extend(fields.get(name, '').strip('{}') for _, name in _FIELD_COLUMNS)
        rows.append(row)
    return pd.DataFrame(rows, columns=columns)


def get_entry(bibfile, bibkey_toupdate):
    """Return the first entry of *bibfile* whose key is *bibkey_toupdate*.

    Returns ``None`` when no entry has that key. This is a linear scan;
    ``merge_bib_entries`` uses a dict index instead, and this function is
    kept for interactive use.
    """
    return next((entry for entry in bibfile if entry.key == bibkey_toupdate), None)


def merge_bib_entries(diag_bib_raw, updated_bib_raw):
    """Merge *updated_bib_raw* into *diag_bib_raw* in place.

    Existing entries (matched on ``.key``) receive the ``all_ss_ids`` and
    ``pmid`` fields of their updated counterpart when the counterpart has
    them; all other fields are left alone. Entries whose key only occurs in
    *updated_bib_raw* are appended, in their original order.

    Entries of type ``'string'`` take no part in the matching on either
    side, so one that happens to share a key with a regular entry is never
    used as an update source and is never appended.

    Args:
        diag_bib_raw: mutable list of current entries; modified in place.
        updated_bib_raw: iterable of entries carrying the updates.

    Returns:
        List with the keys of the appended entries, in append order.
    """
    # One pass to index the updates; the first entry wins for duplicate keys,
    # which matches what a front-to-back scan would return.
    updated_by_key = {}
    for entry in updated_bib_raw:
        if entry.type != _STRING_TYPE:
            updated_by_key.setdefault(entry.key, entry)

    known_keys = set()
    for entry in diag_bib_raw:
        if entry.type == _STRING_TYPE:
            continue
        known_keys.add(entry.key)
        source = updated_by_key.get(entry.key)
        if source is None:
            continue
        for name in _MERGED_FIELDS:
            if name in source.fields:
                entry.fields[name] = source.fields[name]

    keys_to_add = updated_by_key.keys() - known_keys
    added_keys = []
    # Walk the updated file (not the index) so that appended entries keep
    # their file order and duplicated keys are carried over unchanged.
    for entry in updated_bib_raw:
        if entry.type != _STRING_TYPE and entry.key in keys_to_add:
            diag_bib_raw.append(entry)
            added_keys.append(entry.key)
    return added_keys


def main(path_diag_bib=PATH_DIAG_BIB, path_new_bib=PATH_NEW_BIB,
         path_merged_bib=PATH_MERGED_BIB):
    """Read both bib files, merge them and save the result.

    Args:
        path_diag_bib: bib file to update.
        path_new_bib: bib file holding the updated/new entries.
        path_merged_bib: where the merged bib file is written.

    Returns:
        Tuple ``(merged_entries, added_keys)`` so the result can be inspected
        afterwards, e.g. with ``from_bib_to_csv(merged_entries)``.
    """
    # Imported here rather than at module level so the helpers above can be
    # imported (and tested) without the project package being installed.
    from bib_handling_code.processbib import read_bibfile
    from bib_handling_code.processbib import save_to_file

    diag_bib_raw = read_bibfile(None, path_diag_bib)
    updated_bib_raw = read_bibfile(None, path_new_bib)

    added_keys = merge_bib_entries(diag_bib_raw, updated_bib_raw)
    print(f'Added {len(added_keys)} new entries: {sorted(added_keys)}')

    save_to_file(diag_bib_raw, path_merged_bib)
    return diag_bib_raw, added_keys


if __name__ == '__main__':
    main()