-
Notifications
You must be signed in to change notification settings - Fork 1
/
makecorpus.py
55 lines (41 loc) · 1.64 KB
/
makecorpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# prints all papers into file one per line with words separated by space, to be used with LDA
import os
import re
from operator import itemgetter
from string import punctuation
def makecorpus(relpath, outfname):
    """Extract the text of every PDF under relpath and write the corpus
    to outfname: one paper per line, words separated by spaces (LDA input).

    relpath  -- directory holding the PDFs; must end with a path separator,
                since filenames are appended by plain string concatenation.
    outfname -- path of the corpus file to create (overwritten if present).

    Requires the ``pdftotext`` command-line tool and a ``stopwords.txt``
    file in the current working directory.
    """
    # Load stopwords (boring words to ignore). A set gives O(1) membership
    # tests in the per-word filter below; the original list was O(n) per word.
    with open("stopwords.txt", "r") as sf:
        stopwords = {x.strip(punctuation) for x in sf.read().split() if len(x) > 2}

    # Only the PDFs supplied by NIPS; ignore any other files in the directory.
    pdfs = [x for x in os.listdir(relpath) if x.endswith(".pdf")]

    # Raw string avoids the invalid-escape-sequence warning for \w, and
    # compiling once hoists the pattern out of the per-word loop.
    word_re = re.compile(r'^[\w-]+$')

    with open(outfname, "w") as outf:
        for i, f in enumerate(pdfs):
            # Slice assumes a fixed-prefix NIPS filename scheme — TODO confirm
            # against the actual file names in relpath.
            paperid = f[9:-4]
            fullpath = relpath + f
            print("processing %s, %d/%d" % (paperid, i, len(pdfs)))
            # Convert the PDF to a scratch text file via the pdftotext tool.
            cmd = "pdftotext %s %s" % (fullpath, "out.txt")
            print("pdEXEC: " + cmd)
            os.system(cmd)
            # Read all words; 'with' ensures the handle is closed each iteration
            # (the original leaked one open file per PDF).
            with open("out.txt") as tf:
                txtlst = tf.read().split()
            # Keep lowercase alphanumerics/hyphens, drop short words and stopwords.
            words = [x.lower() for x in txtlst if word_re.match(x) is not None]
            words = [x for x in words if len(x) > 2 and x not in stopwords]
            # Drop rare words (frequency < 3) for efficiency downstream.
            wcount = {}
            for w in words:
                wcount[w] = wcount.get(w, 0) + 1
            words = [x for x in words if wcount[x] >= 3]
            outf.write(" ".join(words))
            outf.write("\n")
if __name__ == "__main__":
import sys
year=2016
conference="cvpr"
argc=len(sys.argv)
if(argc>1):
year=int(sys.argv[1])
if(argc>2):
conference=sys.argv[2]
relpath= conference+str(year)+"/"
outfname=relpath+"/allpapers.txt"
makecorpus(relpath,outfname)