-
Notifications
You must be signed in to change notification settings - Fork 0
/
search.py
127 lines (104 loc) · 5.13 KB
/
search.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import aiohttp
import asyncio
import collections
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sentence_transformers import CrossEncoder, util
import urllib.parse
from flask import Flask, request, jsonify, redirect, url_for
# Flask application object; the route handlers in this file register on it.
app = Flask(__name__)
# Cross-encoder model used to score (description, text) pairs. It is loaded
# once at import time with max_length=256 (presumably a token limit).
# NOTE(review): the device is hard-coded to "cuda:0" -- presumably this fails
# on hosts without a CUDA GPU; confirm before deploying elsewhere.
cross_enc = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-4-v2", max_length=256, device="cuda:0")
# Stemmer and English stop-word set used by get_keywords().
stemmer = PorterStemmer()
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be
# downloaded beforehand -- confirm.
stop_words = set(stopwords.words('english'))
def get_keywords(description):
    """Extract de-duplicated, stemmed search keywords from free text.

    The text is tokenized, English stop words are dropped (compared
    case-insensitively), every remaining token is stemmed, and stems that
    are not purely alphanumeric (e.g. punctuation) are discarded.

    Args:
        description: Free-form text to derive keywords from.

    Returns:
        A list of unique stems, in order of first appearance.
    """
    tokens = word_tokenize(description)
    stems = (stemmer.stem(t) for t in tokens if t.lower() not in stop_words)
    # dict.fromkeys() drops repeats while keeping first-seen order. Each
    # keyword is turned into its own search request by search_hn(), so a
    # repeated stem would only trigger a redundant, identical query.
    return list(dict.fromkeys(s for s in stems if s.isalnum()))
async def query_hn(session, query_str, tags, hits_per_page=50):
    """Run a single search against the Hacker News Algolia search endpoint.

    Args:
        session: Open HTTP client session (``aiohttp.ClientSession``) used
            to send the request.
        query_str: Search query text.
        tags: Value sent as the ``tags`` request parameter.
        hits_per_page: Value sent as the ``hitsPerPage`` request parameter.

    Returns:
        The decoded JSON body of the response.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP
            error status (4xx/5xx).
    """
    params = {"query": query_str, "hitsPerPage": hits_per_page, "tags": tags}
    async with session.get("http://hn.algolia.com/api/v1/search", params=params) as response:
        # Fail here, with the real HTTP status, rather than later with an
        # unrelated JSON-decoding error or a missing "hits" key.
        response.raise_for_status()
        return await response.json()
async def search_hn(description):
    """Yield the hit list of each per-keyword comment search as it finishes.

    One ``query_hn`` request with ``tags="comment"`` is started for every
    keyword returned by ``get_keywords(description)``; all requests share a
    single ``aiohttp.ClientSession``. Hit lists are yielded in completion
    order, which need not match keyword order.
    """
    terms = get_keywords(description)
    async with aiohttp.ClientSession() as http:
        pending = [query_hn(http, term, tags="comment") for term in terms]
        for finished in asyncio.as_completed(pending):
            payload = await finished
            yield payload["hits"]
async def get_best_submissions(desc, n=500):
    """Rank Hacker News stories by relevance to a description.

    Every comment hit produced by ``search_hn(desc)`` is scored against
    ``desc`` with the cross-encoder, and so is the title of the story it
    belongs to. Hits are grouped per story and each story receives an
    overall score that blends its best comment score with its title score.

    Args:
        desc: Description text to match against.
        n: Maximum number of stories to return.

    Returns:
        A list of ``(story_id, info)`` tuples sorted by descending
        ``info["score"]``. ``info`` contains ``"title"``, ``"title_score"``,
        ``"score"`` and ``"comments"``; the latter maps each comment's
        ``objectID`` to a dict with ``"comment_score"`` and a tag-stripped
        ``"comment_text"`` snippet (cut to 256 characters plus ``"..."``
        when the comment is longer than that).
    """
    tag_pattern = re.compile('<[^<]+?>')  # hacky HTML-tag stripper
    snippet_len = 256
    results = collections.defaultdict(dict)

    async for hits in search_hn(desc):
        # Hits without a comment body or story title cannot be scored and
        # would break the regex below.
        # NOTE(review): presumably the API can return null for these
        # fields -- confirm against real responses.
        hits = [
            h for h in hits
            if h.get("comment_text") is not None and h.get("story_title") is not None
        ]
        if not hits:
            # Nothing to score; predict() is not guaranteed to accept an
            # empty batch, so skip the model call entirely.
            continue

        comment_scores = cross_enc.predict([[desc, h['comment_text']] for h in hits])
        title_scores = cross_enc.predict([[desc, h['story_title']] for h in hits])

        for hit, comment_score, title_score in zip(hits, comment_scores, title_scores):
            story_dict = results[hit["story_id"]]
            story_dict["title"] = hit["story_title"]
            story_dict["title_score"] = float(title_score)

            text = tag_pattern.sub('', hit["comment_text"])
            if len(text) > snippet_len:
                # Only mark the snippet as cut when it really was cut.
                text = text[:snippet_len] + "..."

            story_dict.setdefault("comments", {})[hit["objectID"]] = {
                "comment_score": float(comment_score),
                "comment_text": text,
            }

    for res in results.values():
        # comm_factor = 1 / sqrt(4 * k) for k matched comments: the weight of
        # the best comment shrinks as more comments of the story matched,
        # shifting the blend toward the title score.
        comm_factor = 1 / (4 * len(res["comments"])) ** 0.5
        best_comm_score = max(c["comment_score"] for c in res["comments"].values())
        res["score"] = (best_comm_score * comm_factor + res["title_score"]) / (1 + comm_factor)

    best_results = sorted(results.items(), key=lambda item: item[1]["score"], reverse=True)
    return best_results[:n]
@app.route('/', methods=['GET'])
async def index():
    """Serve the single-page search UI.

    Returns:
        An HTML document containing a description form and a script that
        requests ``/best_submissions/?desc=...`` and renders each returned
        story with its scores and its three highest-scoring comments. If
        the request fails or the server answers with an error status, an
        error message is shown instead of leaving the loading text in place.
    """
    # NOTE(review): the script interpolates `title` and `comment_text` into
    # innerHTML without escaping. Both come from a third-party API, so this
    # looks like an HTML-injection risk -- confirm how the upstream data is
    # encoded before adding escaping (comment_text appears to carry HTML
    # entities, which naive escaping would display literally).
    return '''
<html>
<body>
<form id="searchForm">
<label for="desc">HN profile description:</label><br>
<textarea id="desc" name="desc" rows="4" cols="50"></textarea><br>
<input type="button" value="Search" onclick="search()">
</form>
<div id="results"></div>
<script>
function search() {
const desc = document.getElementById('desc').value;
document.getElementById('results').innerHTML = '<p>Loading results...</p>';
fetch('/best_submissions/?desc=' + encodeURIComponent(desc))
.then(response => {
if (!response.ok) {
throw new Error('Request failed with status ' + response.status);
}
return response.json();
})
.then(data => {
const submissionsHTML = data.map(([id, {score, title, title_score, comments}]) => {
const sortedComments = Object.entries(comments).sort(([,a], [,b]) => b.comment_score - a.comment_score).slice(0, 3);
return `
<li>
<a href="https://news.ycombinator.com/item?id=${id}">${title}</a><br>
<small><i>(Overall Score: ${score.toFixed(2)})</i></small>
<small><i>(Title Score: ${title_score.toFixed(2)})</i></small><br>
<ul>
${sortedComments.map(([comment_id, {comment_score, comment_text}]) => `
<li>
<a href="https://news.ycombinator.com/item?id=${comment_id}">${comment_text}</a>
<small><i>(Comment Score: ${comment_score.toFixed(2)})</i></small>
</li>
`).join('')}
</ul>
</li>
`;
}).join('<hr/>');
document.getElementById('results').innerHTML = `<ul>${submissionsHTML}</ul>`;
})
.catch(error => {
document.getElementById('results').textContent = 'Search failed: ' + error.message;
});
}
</script>
</body>
</html>
'''
@app.route('/best_submissions/')
async def best_submissions():
    """Return the ranked submissions for the ``desc`` query parameter as JSON.

    A missing ``desc`` parameter is treated as an empty description rather
    than failing with ``AttributeError`` when ``.strip()`` is called on
    ``None``.

    Returns:
        A JSON response wrapping the result of ``get_best_submissions``.
    """
    desc = request.args.get('desc', '')
    submissions = await get_best_submissions(desc.strip())
    return jsonify(submissions)
if __name__ == '__main__':
    import os

    # Debug mode turns on the interactive debugger, which lets a browser
    # client execute code on the server. Because the server listens on all
    # interfaces, debug mode is opt-in (FLASK_DEBUG=1) instead of always on.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in {"1", "true", "yes", "on"}
    app.run(debug=debug_enabled, host="0.0.0.0", port=8081)