# csv2libfm3.py

#!/usr/bin/python
# -*- coding:utf-8 -*-

# This is inspired by Steffen's post describing his approach.

def sparseform(l,offset):
	"""Render a dense row as space-separated ``index:value`` pairs.

	The entry at position ``i`` is written as ``"<offset+1+i>:<str(entry)>"``.
	Only entries equal to the string ``'0'`` are dropped, so numeric zeros
	and spellings such as ``'0.0'`` are still emitted.

	Args:
		l: sequence of values (CSV field strings or numbers).
		offset: amount added to each 1-based position to form the index.

	Returns:
		The pairs joined by single spaces; ``''`` when nothing is kept.
	"""
	# enumerate() runs on both Python 2 and Python 3, unlike xrange().
	return ' '.join(
		str(offset+1+i)+':'+str(value)
		for i, value in enumerate(l)
		if value != '0'
	)

# Default feature row.  In this file only avgword[2:] is read: it is passed
# through sparseform() to pre-fill every slot of worddata before rows from
# newwords.csv overwrite individual slots; the first two entries are unused.
# NOTE(review): presumably these are per-column averages of newwords.csv,
# with the two leading entries lining up with its two id columns - confirm.
avgword = [22.4717711600071, 24637.1264824473, 0.000202872334130734, 0.00489429506090396, 0.00168214977050067, 0.193388052510122, 0.168333319244977, 0.106313556098427, 0.00201181731346312, 0.523173937667475, 0.716764862511729, 0.000380385626495127, 0.00595092180116821, 0.000600163988470089, 0.096601043101918, 0.0129753763704449, 0.0363310538372457, 0.130396192762529, 34.7083009918459, 0.0349164584166808, 0.0187725008490618, 0.0515167651251174, 0.0933043676722936, 0.0220272236749879, 0.0396736829776393, 0.000639605217959256, 0.0297552622332295, 0.0209320325819038, 0.00555446636648828, 0.0654646619584374, 0.0898132729224605, 0.0137901526368622, 0.048486951518646, 0.145609861782486, 0.0661989438476249, 0.145808538084549, 0.0024742622905266, 0.101577501406698, 0.164029331884427, 0.153920930507773, 0.0363881096983869, 0.0921040396953534, 0.0368147187724036, 0.0182852771036537, 0.0352929186053027, 0.146285361209502, 0.00213762796528488, 0.0469770116229619, 0.0665068921158236, 0.106888361045131, 0.103025333682725, 0.120184951944616, 0.0958402718489277, 0.00791314127819944, 0.14099627222086, 0.206685315572159, 0.0714638658523129, 0.040884666806738, 0.0437485217144267, 0.0662690922662844, 0.003534660415038, 0.0782326438491644, 0.0135635854831379, 0.0573201422802906, 0.0230761390880825, 0.0866095806459793, 0.0829473561115948, 0.144208417511264, 0.151875301138621, 0.00154851789611188, 0.191638278628245, 0.0887989112518068, 0.0149624698632621, 0.0401349101021969, 0.0734506288729499, 0.10660862345421, 0.0160492038695778, 0.134681870821041, 0.0297398370065663, 0.0402135659108497, 0.00375347272644512, 0.088739740154352, 0.00292871862960291, 0.0381738108722665, 0.0228743032576158, 0.0219346723150089, 0.0313213190203423, 0.0571790397339329, 0.0497380182291643, 0.0100919570095142, 0.0324546768992819, 0.00852611193895695, 0.0810499974268865, 0.0676387458997592, 0.0044772365257148, 0.0190033882923929, 0.099197849049883, 0.0568313562053214, 0.0354041170277513, 0.0920364155839765, 
0.0288601402127466]

# Feature strings looked up by the converters as worddata[artist + 50*user].
# Every slot starts as the default row built from avgword[2:] and is replaced
# below for each pair that has a row in newwords.csv.  Feature indices start
# after 184+50928, beyond the track and user indices the converters emit.
# NOTE(review): presumably 50 artists, 184 tracks and 50928 users - confirm.
worddata = [sparseform(avgword[2:],184+50928)]*(50*50928)

# "with" closes the file even when a row fails to parse.
with open("newwords.csv","r") as wordfile:
	next(wordfile, None)  # discard the first line (presumably a header row)
	for s in wordfile:
		# Strip only the line terminator: blindly dropping the last character
		# would cut a real digit off a final line that has no newline and
		# would leave a stray '\r' on CRLF files.
		l = s.rstrip('\r\n').split(',')
		# Columns 0 and 1 select the slot (same formula the converters use
		# with artist and user); the remaining columns are the features.
		worddata[int(l[0])+int(l[1])*50] = sparseform(l[2:],184+50928)

def train2libfm(name):
	"""Convert the training CSV ``name`` to libFM text in ``name + ".libfm3"``.

	The first line of the input is discarded.  Every other line must hold
	exactly five comma-separated fields (artist, track, user, rating, time)
	and produces one output line:

		<rating> <track>:1 <user+184>:1 <worddata[artist + 50*user]>

	The time field is read but not used.

	Args:
		name: path of the input CSV; the output path is ``name + ".libfm3"``.

	Raises:
		ValueError: a row does not have five fields or an id is not an integer.
		IndexError: ``artist + 50*user`` falls outside ``worddata``.
	"""
	# Context managers close both files even if a row fails to parse.
	with open(name,"r") as f:
		with open(name+".libfm3","w") as g:
			next(f, None)  # discard the first line (presumably a header row)
			# Stream the rows instead of loading the whole file at once.
			for line in f:
				(artist,track,user,rating,_time) = line.rstrip('\r\n').split(',')
				g.write(' '.join([rating,str(int(track))+':1',str(int(user)+184)+':1',worddata[int(artist)+50*int(user)]])+'\n')

def test2libfm(name):
	"""Convert the test CSV ``name`` to libFM text in ``name + ".libfm3"``.

	The first line of the input is discarded.  Every other line must hold
	exactly four comma-separated fields (artist, track, user, time) and
	produces one output line with the constant target "0":

		0 <track>:1 <user+184>:1 <worddata[artist + 50*user]>

	The time field is read but not used.

	Args:
		name: path of the input CSV; the output path is ``name + ".libfm3"``.

	Raises:
		ValueError: a row does not have four fields or an id is not an integer.
		IndexError: ``artist + 50*user`` falls outside ``worddata``.
	"""
	# Context managers close both files even if a row fails to parse.
	with open(name,"r") as f:
		with open(name+".libfm3","w") as g:
			next(f, None)  # discard the first line (presumably a header row)
			# Stream the rows instead of loading the whole file at once.
			for line in f:
				(artist,track,user,_time) = line.rstrip('\r\n').split(',')
				g.write(' '.join(["0",str(int(track))+':1',str(int(user)+184)+':1',worddata[int(artist)+50*int(user)]])+'\n')

if __name__ == '__main__':
	# Run each converter on its CSV, training set first; the outputs are
	# written next to the inputs with a ".libfm3" suffix.
	for convert, csv_name in ((train2libfm, "train.csv"), (test2libfm, "test.csv")):
		convert(csv_name)