Skip to content

Commit

Permalink
Merge pull request #39 from johnwmillr/add-filter-for-non-songs
Browse files Browse the repository at this point in the history
Add heuristic method for removing non-songs
  • Loading branch information
johnwmillr authored Jun 4, 2018
2 parents 5ecdca3 + 399ea28 commit 479a103
Show file tree
Hide file tree
Showing 2 changed files with 127 additions and 87 deletions.
212 changes: 126 additions & 86 deletions lyricsgenius/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,12 @@
# print(artist.songs[-1])

import sys
from urllib.request import Request, urlopen, quote # Python 3
import os, re
import requests, socket, json
from urllib.request import Request, urlopen, quote
import os
import re
import requests
import socket
import json
from bs4 import BeautifulSoup
from string import punctuation
import time
Expand All @@ -21,137 +24,163 @@
from .song import Song
from .artist import Artist


class _API(object):
# This is a superclass that Genius() inherits from. Not sure if this makes any sense, but it
# seemed like a good idea to have this class (more removed from user) handle the lower-level
# interaction with the Genius API, and then Genius() has the more user-friendly search
# functions
"""Interface with the Genius.com API
Attributes:
base_url: (str) Top-most URL to access the Genius.com API with
Methods:
_load_credentials()
OUTPUT: client_id, client_secret, client_access_token
_make_api_request()
INPUT:
OUTPUT:
"""
INPUT:
OUTPUT:
"""

# Genius API constants
_API_URL = "https://api.genius.com/"
_API_URL = "https://api.genius.com/"
_API_REQUEST_TYPES =\
{'song': 'songs/', 'artist': 'artists/', 'artist-songs': 'artists/songs/','search': 'search?q='}

def __init__(self, client_access_token, client_secret='', client_id=''):
{'song': 'songs/', 'artist': 'artists/',
'artist-songs': 'artists/songs/', 'search': 'search?q='}

def __init__(self, client_access_token, client_secret='', client_id=''):
self._CLIENT_ACCESS_TOKEN = client_access_token
self._HEADER_AUTHORIZATION = 'Bearer ' + self._CLIENT_ACCESS_TOKEN

def _make_api_request(self, request_term_and_type, page=1):
"""Send a request (song, artist, or search) to the Genius API, returning a json object
INPUT:
request_term_and_type: (tuple) (request_term, request_type)
*request term* is a string. If *request_type* is 'search', then *request_term* is just
what you'd type into the search box on Genius.com. If you have a song ID or an artist ID,
you'd do this: self._make_api_request(('2236', 'song'))
Returns a json object.
"""

#The API request URL must be formatted according to the desired request type"""
api_request = self._format_api_request(request_term_and_type, page=page)

"""

# TODO: This should maybe be a generator

# The API request URL must be formatted according to the desired
# request type
api_request = self._format_api_request(
request_term_and_type, page=page)

# Add the necessary headers to the request
request = Request(api_request)
request = Request(api_request)
request.add_header("Authorization", self._HEADER_AUTHORIZATION)
request.add_header("User-Agent","LyricsGenius")
request.add_header("User-Agent", "LyricsGenius")
while True:
try:
response = urlopen(request, timeout=4) #timeout set to 4 seconds; automatically retries if times out
# timeout set to 4 seconds; automatically retries if times out
response = urlopen(request, timeout=4)
raw = response.read().decode('utf-8')
except socket.timeout:
print("Timeout raised and caught")
continue
break

return json.loads(raw)['response']

def _format_api_request(self, term_and_type, page=1):
"""Format the request URL depending on the type of request"""
"""Format the request URL depending on the type of request"""

request_term, request_type = str(term_and_type[0]), term_and_type[1]
request_term, request_type = str(term_and_type[0]), term_and_type[1]
assert request_type in self._API_REQUEST_TYPES, "Unknown API request type"

# TODO - Clean this up (might not need separate returns)
if request_type=='artist-songs':
if request_type == 'artist-songs':
return self._API_URL + 'artists/' + quote(request_term) + '/songs?per_page=50&page=' + str(page)
else:
else:
return self._API_URL + self._API_REQUEST_TYPES[request_type] + quote(request_term)

def _scrape_song_lyrics_from_url(self, URL, remove_section_headers=False):
"""Use BeautifulSoup to scrape song info off of a Genius song URL"""
page = requests.get(URL)
"""Use BeautifulSoup to scrape song info off of a Genius song URL"""
page = requests.get(URL)
html = BeautifulSoup(page.text, "html.parser")

# Scrape the song lyrics from the HTML
lyrics = html.find("div", class_="lyrics").get_text()
if remove_section_headers:
lyrics = re.sub('(\[.*?\])*', '', lyrics) # Remove [Verse] and [Bridge] stuff
lyrics = re.sub('\n{2}', '\n', lyrics) # Remove gaps between verses
# Remove [Verse] and [Bridge] stuff
lyrics = re.sub('(\[.*?\])*', '', lyrics)
# Remove gaps between verses
lyrics = re.sub('\n{2}', '\n', lyrics)

return lyrics.strip('\n')

def _clean(self, s):
return s.translate(str.maketrans('','',punctuation)).replace('\u200b', " ").strip().lower()
def _clean_str(self, s):
    """Normalize *s* so that titles and artist names can be compared.

    ASCII punctuation is removed, zero-width spaces (U+200B) are turned
    into ordinary spaces, surrounding whitespace is stripped, and the
    result is lowercased.
    """
    # Passing the punctuation as the third maketrans() argument deletes it.
    drop_punctuation = str.maketrans('', '', punctuation)
    without_punctuation = s.translate(drop_punctuation)
    spaced = without_punctuation.replace('\u200b', " ")
    return spaced.strip().lower()

def _result_is_lyrics(self, song_title):
    """Return False if a Genius result is not actually song lyrics.

    This is a title-based heuristic: a result is rejected when its title
    mentions a track list, album art(work), liner notes, a booklet, or
    credits (case-insensitive, matched anywhere in the title).

    INPUT:
        song_title: (str) Title of the search result. Callers pass both
            raw titles and titles already normalized by _clean_str().
    OUTPUT:
        (bool) True if the title looks like a real song, False otherwise.
    """
    # search_song() hands in punctuation-stripped titles while
    # search_artist() hands in raw ones, so "Track-List" and "tracklist"
    # must be judged the same way. Allow any run of non-alphanumeric
    # characters (or none) between the words of each multi-word term.
    sep = r"[\W_]*"
    non_song_terms = (
        "track" + sep + "list",
        "album" + sep + "art(work)?",
        "liner" + sep + "notes",
        "booklet",
        "credits",
    )
    regex = re.compile("|".join(non_song_terms), re.IGNORECASE)
    return not regex.search(song_title)


class Genius(_API):
"""User-level interface with the Genius.com API. User can search for songs (getting lyrics) and artists (getting songs)"""
"""User-level interface with the Genius.com API. User can search for songs (getting lyrics) and artists (getting songs)"""

def search_song(self, song_title, artist_name="", take_first_result=False, verbose=True, remove_section_headers=False):
def search_song(self, song_title, artist_name="", take_first_result=False, verbose=True, remove_section_headers=False, remove_non_songs=True):
# TODO: Should search_song() be a @classmethod?
"""Search Genius.com for *song_title* by *artist_name*"""
"""Search Genius.com for *song_title* by *artist_name*"""

# Perform a Genius API search for the song
if verbose:
if artist_name != "":
print('Searching for "{0}" by {1}...'.format(song_title, artist_name))
else:
print('Searching for "{0}" by {1}...'.format(
song_title, artist_name))
else:
print('Searching for "{0}"...'.format(song_title))
search_term = "{} {}".format(song_title, artist_name)

json_search = self._make_api_request((search_term,'search'))

# Loop through search results, stopping as soon as title and artist of result match request
n_hits = min(10,len(json_search['hits']))

json_search = self._make_api_request((search_term, 'search'))

# Loop through search results, stopping as soon as title and artist of
# result match request
n_hits = min(10, len(json_search['hits']))
for i in range(n_hits):
search_hit = json_search['hits'][i]['result']
found_song = self._clean(search_hit['title'])
found_artist = self._clean(search_hit['primary_artist']['name'])

found_song = self._clean_str(search_hit['title'])
found_artist = self._clean_str(
search_hit['primary_artist']['name'])

# Download song from Genius.com if title and artist match the request
if take_first_result or found_song == self._clean(song_title) and found_artist == self._clean(artist_name) or artist_name == "":

# Found correct song, accessing API ID
json_song = self._make_api_request((search_hit['id'],'song'))

# Scrape the song's HTML for lyrics
lyrics = self._scrape_song_lyrics_from_url(json_song['song']['url'], remove_section_headers)

# Create the Song object
song = Song(json_song, lyrics)
if take_first_result or found_song == self._clean_str(song_title) and found_artist == self._clean_str(artist_name) or artist_name == "":

# Remove non-song results (e.g. Liner Notes, Tracklists, etc.)
song_is_valid = self._result_is_lyrics(found_song) if remove_non_songs else True
if song_is_valid:
# Found correct song, accessing API ID
json_song = self._make_api_request((search_hit['id'],'song'))

if verbose:
print('Done.')
return song
# Scrape the song's HTML for lyrics
lyrics = self._scrape_song_lyrics_from_url(json_song['song']['url'], remove_section_headers)

# Create the Song object
song = Song(json_song, lyrics)

if verbose:
print('Done.')
return song
else:
if verbose:
print('Specified song does not contain lyrics. Rejecting.')
return None

if verbose:
print('Specified song was not first result :(')
return None

def search_artist(self, artist_name, verbose=True, max_songs=None, take_first_result=False, get_full_song_info=True, remove_section_headers=False):
def search_artist(self, artist_name, verbose=True, max_songs=None, take_first_result=False, get_full_song_info=True, remove_section_headers=False, remove_non_songs=True):
"""Allow user to search for an artist on the Genius.com database by supplying an artist name.
Returns an Artist() object containing all songs for that particular artist."""

Expand All @@ -166,7 +195,7 @@ def search_artist(self, artist_name, verbose=True, max_songs=None, take_first_re
if first_result is None:
first_result = found_artist
artist_id = found_artist['id']
if take_first_result or self._clean(found_artist['name'].lower()) == self._clean(artist_name.lower()):
if take_first_result or self._clean_str(found_artist['name'].lower()) == self._clean_str(artist_name.lower()):
artist_name = found_artist['name']
break
else:
Expand Down Expand Up @@ -199,26 +228,37 @@ def search_artist(self, artist_name, verbose=True, max_songs=None, take_first_re
while keep_searching:
for json_song in artist_search_results['songs']:
# TODO: Shouldn't I use self.search_song() here?
# Scrape song lyrics from the song's HTML
lyrics = self._scrape_song_lyrics_from_url(json_song['url'], remove_section_headers)

# Create song object for current song
if get_full_song_info:
song = Song(self._make_api_request((json_song['id'], 'song')), lyrics)
else:
song = Song({'song':json_song}, lyrics) # Faster, less info from API

# Add song to the Artist object
if artist.add_song(song, verbose=False)==0:
n += 1

# Songs must have a title
if 'title' not in json_song:
json_song['title'] = 'MISSING TITLE'

# Remove non-song results (e.g. Liner Notes, Tracklists, etc.)
song_is_valid = self._result_is_lyrics(json_song['title']) if remove_non_songs else True

if song_is_valid:
# Scrape song lyrics from the song's HTML
lyrics = self._scrape_song_lyrics_from_url(json_song['url'], remove_section_headers)

# Create song object for current song
if get_full_song_info:
song = Song(self._make_api_request((json_song['id'], 'song')), lyrics)
else:
song = Song({'song':json_song}, lyrics) # Faster, less info from API

# Add song to the Artist object
if artist.add_song(song, verbose=False) == 0:
# print("Add song: {}".format(song.title))
n += 1
if verbose:
print('Song {0}: "{1}"'.format(n, song.title))

else: # Song does not contain lyrics
if verbose:
try:
print('Song {0}: "{1}"'.format(n,song.title))
except:
pass
print('"{title}" does not contain lyrics. Rejecting.'.format(title=json_song['title']))

# Check if user specified a max number of songs for the artist
if not isinstance(max_songs,type(None)):
if not isinstance(max_songs, type(None)):
if artist.num_songs >= max_songs:
keep_searching = False
if verbose:
Expand Down
2 changes: 1 addition & 1 deletion tests/test_genius.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def test_song(self):

def test_title(self):
msg = "The returned song title does not match the title of the requested song."
self.assertEqual(api._clean(self.song.title), api._clean(self.song_title), msg)
self.assertEqual(api._clean_str(self.song.title), api._clean_str(self.song_title), msg)

def test_artist(self):
msg = "The returned artist name does not match the artist of the requested song."
Expand Down

0 comments on commit 479a103

Please sign in to comment.