DTree.py
import re
# import nltk
import numpy as np
# from keras.preprocessing.text import one_hot
from sklearn import tree
from sklearn import svm
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
import read_html
# Test a Decision Tree classifier and compare it with an SVM.
# Two features per headline: special-character count and title length
# (see the illustrative example after the helper functions below).
def clean_sentence(s):
    # Lowercase the headline and strip surrounding whitespace.
    c = s.lower().strip()
    return c


def count_special_character(sentences):
    # Feature 1: count '!', '?' and '|' characters; '.' characters are
    # counted in pairs, so a single full stop adds nothing but repeated
    # periods (e.g. an ellipsis) do add to the count.
    l = list(sentences)
    count = 0
    flag = 0
    for ch in l:
        if (ch == '!') or (ch == '?') or (ch == '|'):
            count = count + 1
        if ch == '.':
            if flag == 0:
                flag = 1
            else:
                count = count + 1
                flag = 0
    return count


def count_length(sentences):
    # Feature 2: headline length in characters.
    return len(sentences)


def set_label(fake_size, real_size):
    # Build the label vector: 0 for fake headlines, 1 for real ones.
    label = list()
    for counter in range(0, fake_size):
        label.append(0)
    for counter in range(0, real_size):
        label.append(1)
    return label
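# Illustrative example (not part of the original script) for a made-up
# headline, showing how the two features are derived:
#
#   clean_sentence("You won't BELIEVE this...")           -> "you won't believe this..."
#   count_special_character("you won't believe this...")  -> 1   (one pair of '.')
#   count_length("you won't believe this...")             -> 25
#
# so the feature vector passed to the classifiers would be [1, 25].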
train_news = list()
fake_size = 0
real_size = 0
# Build the training feature matrix: fake headlines first, then real ones.
with open('./data/fake_news_training.txt') as train1:
    with open('./data/real_news_training.txt') as train2:
        for line in train1:
            special_character = count_special_character(clean_sentence(line))
            line_length = count_length(clean_sentence(line))
            train_news.append([special_character, line_length])
            fake_size = fake_size + 1
        for line in train2:
            special_character = count_special_character(clean_sentence(line))
            line_length = count_length(clean_sentence(line))
            train_news.append([special_character, line_length])
            real_size = real_size + 1
predict_news = list()
# Feature vectors for the held-out test headlines: real first, then fake.
with open('./data/testing_real.txt') as predict1:
    with open('./data/testing_fake.txt') as predict2:
        for line in predict1:
            special_character = count_special_character(clean_sentence(line))
            line_length = count_length(clean_sentence(line))
            predict_news.append([special_character, line_length])
        for line in predict2:
            special_character = count_special_character(clean_sentence(line))
            line_length = count_length(clean_sentence(line))
            predict_news.append([special_character, line_length])
labels = set_label(fake_size, real_size)
# print(labels)
# print(train_news)

print("test result with [R, R, R, R, F, F, F, F]")
print("Decision Tree Result: (1 for Real, 0 for fake)")
clf = tree.DecisionTreeClassifier()
clf = clf.fit(train_news, labels)
print(clf.predict(predict_news))

print("SVM Result: (1 for Real, 0 for fake)")
classif = OneVsRestClassifier(estimator=SVC(random_state=0))
print(classif.fit(train_news, labels).predict(predict_news))
print "trying list of news from internet!"
list_headlines = read_html.getFakeNews('data/test.txt')
test_news = []
for line in list_headlines:
special_character = count_special_character(clean_sentence(line))
line_length = count_length(clean_sentence(line))
test_news.append([special_character, line_length])
print "test result with [F, F, R]"
print "Decision Tree Result: (1 for Real, 0 for fake)"
clf = tree.DecisionTreeClassifier()
clf = clf.fit(train_news, labels)
print clf.predict(test_news)
# output:
# test result with [R, R, R, R, F, F, F, F]
# Decision Tree Result: (1 for Real, 0 for fake)
# [1 0 1 1 1 0 0 0]
# SVM Result: (1 for Real, 0 for fake)
# [1 0 1 0 1 0 0 0]
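
# Optional follow-up (a minimal sketch, not part of the original script):
# score both classifiers against the hand-labelled expectation above using
# sklearn's accuracy_score, assuming the test files really are ordered
# [R, R, R, R, F, F, F, F] as the print statements state.
from sklearn.metrics import accuracy_score

expected = [1, 1, 1, 1, 0, 0, 0, 0]  # 1 = real, 0 = fake
print("Decision Tree accuracy:", accuracy_score(expected, clf.predict(predict_news)))
print("SVM accuracy:", accuracy_score(expected, classif.predict(predict_news)))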