Revision | 2c007f732b7bab7aa98c765d88647b0014c2bdcf (tree) |
---|---|
Zeit | 2015-03-26 01:36:49 |
Autor | Lorenzo Isella <lorenzo.isella@gmai...> |
Committer | Lorenzo Isella |
A simple script to convert the test and train datasets (without the target values!) to numerical matrices of term frequency–inverse document frequency (TF-IDF) features.
@@ -0,0 +1,43 @@ | ||
#! /usr/bin/env python
"""Convert the train and test datasets to TF-IDF feature matrices.

Reads ``train.csv`` and ``test.csv`` from the current working directory,
drops the ``id`` column from both (and ``target`` from the training set),
fits a ``TfidfTransformer`` on the training features, applies it to both
sets and writes the resulting dense arrays to ``train-tfidf.csv`` and
``test-tfidf.csv``.  The target values are not written out.
"""

import pandas as pd
import numpy as np
from sklearn import ensemble, feature_extraction, preprocessing


# import data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# drop the ids and the target column so that only feature columns remain
train = train.drop(['id', 'target'], axis=1)
test = test.drop('id', axis=1)

# transform counts to TF-IDF features; the transformer is fitted on the
# training set only and then reused as-is for the test set
tfidf = feature_extraction.text.TfidfTransformer()
train = tfidf.fit_transform(train).toarray()
test = tfidf.transform(test).toarray()

# write the dense matrices as comma-separated text (no header, no targets)
np.savetxt("train-tfidf.csv", train, delimiter=",")
np.savetxt("test-tfidf.csv", test, delimiter=",")

# the parenthesised form is valid on Python 3 and prints the same text on
# Python 2, unlike the bare print statement
print("So far so good")