doi2bibtex

#!/bin/sh

# doi2bibtex - Get BibTeX from DOI
#
# Authors: Felix Ospald <felix.ospald@mathematik.tu-chemnitz.de>
#          Gerd Wachsmuth <gerd.wachsmuth@mathematik.tu-chemnitz.de>
# Version: 2017.1
# Licence: GPL v3

# maybe interesting (uses javascript):
# http://www.doi2bib.org/#/doi

# Issue help message if necessary.
# The program name is derived with a parameter expansion instead of an
# unquoted $(basename $0), so that a script path containing spaces or glob
# characters still yields the correct name.
if [ "$#" -eq 0 ]; then
	printf '%s\n' \
		"Usage: ${0##*/} DOI" \
		"or:    ${0##*/} http://dx.doi.org/DOI" \
		"will try and retrieve the BibTeX file for the DOI given."
	exit 1
fi

# Extract doi. Pattern from https://github.com/regexhq/doi-regex/blob/master/index.js
# DOI=$(echo $1 | grep '10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?![%"#? ])\S)+' -P -o )
# https://www.crossref.org/blog/dois-and-matching-regular-expressions/
#
# The argument is quoted and passed through printf so that it is neither
# word-split, glob-expanded nor interpreted by echo. Only the first match is
# kept, so an argument containing several DOI-like strings still yields a
# single, valid URL.
DOI=$(printf '%s\n' "$1" |
	grep -E -o '10[.][0-9]{4,9}/[-._;()/:a-zA-Z0-9<>]+' |
	head -n 1)
if [ -z "$DOI" ]; then
	printf '%s: no DOI found in "%s"\n' "${0##*/}" "$1" >&2
	exit 1
fi

# Ask the DOI resolver for a BibTeX record via content negotiation.
#   -f   fail on HTTP errors instead of returning the error page
#   -sS  no progress meter, but still report errors on stderr
#   -L   follow the redirect from doi.org to the registration agency
if ! BIBTEX=$(curl -fsSL -H "Accept: application/x-bibtex" "https://doi.org/$DOI"); then
	printf '%s: could not retrieve BibTeX for DOI %s\n' "${0##*/}" "$DOI" >&2
	exit 1
fi
# env echo "$BIBTEX"

# PROG1: first awk pass over the BibTeX record.
#   * the entry header line (starting with "@") is stripped of spaces and
#     converted to upper case,
#   * the fields MRNUMBER, MRREVIEWER, MRCLASS, CODEN and a URL field that
#     merely points to doi.org are dropped, including their continuation lines,
#   * everything else is passed through unchanged.
# The program relies on the gawk extensions IGNORECASE and \s.
PROG1='
BEGIN {
	IGNORECASE = 1;
	# Non-zero while the lines of an unwanted field are being skipped.
	skipping = 0;
}
{
	if ($0 ~ /^\s*@/) {
		# Entry header: remove blanks and print it in upper case.
		gsub(" ", "");
		print(toupper($0));
	} else if ($0 ~ /^\s*(MRNUMBER|MRREVIEWER|MRCLASS|CODEN|URL\s*=\s*{https?:\/\/(dx\.)?doi\.org)/) {
		# First line of an unwanted field: swallow it and what follows.
		skipping = 1;
	} else if ($0 ~ /^\s*[A-Z]+\s*=/) {
		# First line of any other field: stop skipping and keep it.
		skipping = 0;
		print;
	} else if ($0 ~ /^}$/ || !skipping) {
		# The closing brace is always kept; continuation lines are kept
		# only if they belong to a field that is not being skipped.
		print;
	}
}'

# PROG2: second awk pass (gawk only, because of IGNORECASE and gensub).
# Replaces the citation key of the entry by a generated label consisting of
# the last names of the authors (or of the editors, if there are no authors)
# followed by the year. All other lines are passed through unchanged.
# Expects the header in the form produced by PROG1 and the closing brace of
# the entry on a line of its own.
PROG2='
BEGIN {
	# Match the header, field names and the " and " separator in any case.
	IGNORECASE = 1;
}
/^[ \t]*@[A-Z]*{[^,]*,/ {
	# Entry header. Keep everything up to and including the opening brace;
	# the original key is dropped, whatever characters it contains (keys
	# may hold characters such as "/" or ":").
	# String positions in awk start at 1; a start index of 0 is handled
	# differently by different awk implementations.
	head = substr($0, 1, index($0, "{"));
	body = "";
	authors = "";
	authors_active = 0;
	have_read_authors = 0;
	year = 0;
	next;
}
/^}$/ {
	# This is the end of the entry. Print everything!
	body = body $0;
	printf "%s", head;
	# Create a nice label (authors or editors and year) for the entry.
	count = split(authors, author_array, " +and +");
	for (i = 1; i <= count; i++) {
		name = author_array[i];
		# First, try whether the name is given as "lastname, firstname".
		idx = index(name, ", ");
		if (idx > 0) {
			lastname = substr(name, 1, idx - 1);
		} else {
			# Otherwise assume "firstname lastname" and take the last word.
			idx = match(name, " [^ ]*$");
			lastname = substr(name, idx + 1);
		}
		lastname = gensub("{\\\\\"([a-zA-Z])}", "\\1e", "g", lastname);   # umlauts: {\"u} -> ue
		lastname = gensub("{\\\\ss}", "ss", "g", lastname);               # eszett: {\ss} -> ss
		# Other accents: keep the bare letter. Applied twice to resolve
		# nested groups such as "{\v{c}}".
		lastname = gensub("{[^{}]*([a-zA-Z])}", "\\1", "g", lastname);
		lastname = gensub("{[^{}]*([a-zA-Z])}", "\\1", "g", lastname);
		lastname = gensub("'\''", "", "g", lastname);                     # remove apostrophes
		lastname = gensub("},$", "", "g", lastname);                      # remove the closing "}," of the field
		# Split the last name into words, capitalize each of them and
		# glue them together (for multi-word last names).
		word_count = split(lastname, lastname_array, " ");
		lastname = "";
		for (j = 1; j <= word_count; j++) {
			lastname = lastname toupper(substr(lastname_array[j], 1, 1)) substr(lastname_array[j], 2);
		}
		printf "%s", lastname;
	}
	print year ",";
	print body;
	next;
}
/ *[A-Z]* =/ {
	# First line of a field; $1 is the field name.
	field = toupper($1);
	if (field == "AUTHOR") {
		# If we already have read the editors, we overwrite them.
		# The whole remainder of the line is taken, so that long author
		# lists are not cut off in the middle of a name.
		authors = substr($0, index($0, "{") + 1);
		authors_active = 1;
		have_read_authors = 1;
	} else if (field == "EDITOR" && !have_read_authors) {
		# If we already have read the authors, we skip the editors.
		authors = substr($0, index($0, "{") + 1);
		authors_active = 1;
	} else {
		authors_active = 0;
		# Take the first four characters of the first run of digits.
		# If the line holds no digit at all, the default year is kept.
		if (field == "YEAR" && match($0, /[0-9]+/)) {
			year = substr($0, RSTART, 4);
		}
	}
}
/^[^=]*$/ {
	# Continuation line of a multi-line author or editor field.
	if (authors_active) {
		authors = authors $0;
	}
}
{
	# Every line except header and closing brace is collected verbatim.
	body = body $0 "\n";
	next;
}'

# normalize_bibtex TEXT
#
# Prepare a raw BibTeX record for the awk programs and write the result to
# stdout:
#   * remove a "month = xyz" field (three-letter month abbreviation),
#   * if the record arrived on a single line, break it into one field per line,
#   * put exactly one blank on each side of the first "=" of every line.
#
# printf is used instead of echo throughout: the echo builtin of some /bin/sh
# implementations (e.g. dash) expands backslash sequences such as \n or \t,
# which would corrupt LaTeX macros contained in the record.
# The function body runs in a subshell, so its variables do not leak.
normalize_bibtex() (
	text=$1

	# Remove month field from output
	text=$(printf '%s\n' "$text" | sed -E 's/month *= *[a-z]{3} *, *//')
	# Also if it is the last entry
	text=$(printf '%s\n' "$text" | sed -E 's/, *month *= *[a-z]{3} *//')

	# The arithmetic expansion strips the padding some wc implementations
	# put in front of the number.
	line_count=$(printf '%s\n' "$text" | wc -l)
	if [ "$((line_count))" -lt 2 ]; then
		# Insert line breaks: after the key, after every field and before
		# the closing brace. The three sed processes are kept separate
		# because each one works on the lines produced by the previous one.
		text=$(printf '%s\n' "$text" |
			sed 's/,/&\n/' |
			sed 's/},/&\n/g' |
			sed 's/ *}$/\n}/')
	fi

	# Fix "="
	printf '%s\n' "$text" | sed 's/ *= */ = /'
)

BIBTEX=$(normalize_bibtex "$BIBTEX")

# Clean up the fields (PROG1), then generate the citation label (PROG2).
# Both programs use gawk extensions (PROG1: IGNORECASE and \s, PROG2:
# additionally gensub), so gawk is called explicitly for both of them; a
# plain "awk" may be an implementation without these extensions.
# printf passes backslashes in the record through untouched.
printf '%s\n' "$BIBTEX" | gawk "$PROG1" | gawk "$PROG2"

# add a newline
echo