-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbest.py
More file actions
40 lines (28 loc) · 1.11 KB
/
best.py
File metadata and controls
40 lines (28 loc) · 1.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import sys
import nltk
from nltk.stem.porter import *
from sklearn import datasets
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
def stem_tokens(tokens, stemmer):
    """Return the stem of every token, preserving input order.

    Args:
        tokens: Iterable of token strings.
        stemmer: Any object exposing a ``stem(token)`` method; ``tokenize``
            passes a ``PorterStemmer`` instance.

    Returns:
        A new list containing ``stemmer.stem(token)`` for each token.
    """
    return [stemmer.stem(token) for token in tokens]
def tokenize(text):
    """Split *text* into word tokens and return their Porter stems.

    Used as the ``tokenizer`` callable of the TF-IDF vectorizer below.

    Args:
        text: The document string to tokenize.

    Returns:
        The list produced by ``stem_tokens`` for the tokens that
        ``nltk.word_tokenize`` finds in *text*.
    """
    # A fresh stemmer is built on every call, before tokenizing.
    porter = PorterStemmer()
    return stem_tokens(nltk.word_tokenize(text), porter)
# Training script: fit a TF-IDF -> chi-squared feature selection -> linear SVM
# pipeline on a directory of text files and serialize the fitted pipeline.
#   sys.argv[1]: container directory handed to sklearn's ``load_files``
#   sys.argv[2]: destination path for the ``joblib`` dump

# Undecodable bytes are dropped rather than raising (decode_error="ignore").
training_data = datasets.load_files(
    sys.argv[1], encoding="utf-8", decode_error="ignore"
)

# Unigram + bigram TF-IDF features built from the stemmed tokens.
# NOTE(review): stop_words="english" is combined with a stemming tokenizer;
# confirm the stop list still matches the stemmed token forms as intended.
bigram_tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2), tokenizer=tokenize, stop_words="english"
)

# Keep the top 25% of features ranked by the chi-squared statistic.
# ``percentile`` is passed by keyword: it is keyword-only in
# scikit-learn >= 1.0 (a positional value raises TypeError there), and the
# keyword form is accepted by older releases as well.
selector = SelectPercentile(chi2, percentile=25)

print("\nSVM\n")
clf = LinearSVC(penalty="l2", dual=False, C=5.0)

pipe_clf = Pipeline(
    [
        ("vectorizer", bigram_tfidf_vectorizer),
        ("selector", selector),
        ("classifier", clf),
    ]
)
pipe_clf.fit(training_data.data, training_data.target)

# NOTE(review): ``sklearn.externals.joblib`` (imported at the top of this
# file) was removed in scikit-learn 0.23; newer installs need the standalone
# ``joblib`` package imported instead.
joblib.dump(pipe_clf, sys.argv[2])