), "documents")
print(len(classes), "classes", classes)
print(len(words), "unique stemmed words", words)
12 documents
3 classes ['greeting', 'goodbye', 'sandwich']
26 unique stemmed words ['sandwich', 'hav', 'a', 'how', 'for', 'ar', 'good', 'mak', 'me', 'it', 'day', 'soon', 'nic', 'lat', 'going', 'you', 'today', 'can', 'lunch', 'is', "'s", 'see', 'to', 'talk', 'yo', 'what']
Notice that every word has been stemmed and lowercased. Stemming helps the machine treat words such as "have" and "having" as equivalent, and lowercasing means we ignore differences in case.
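To see this in action, here is a minimal sketch. It assumes the stemmer used above is NLTK's LancasterStemmer (an assumption based on the aggressive roots such as 'hav' and 'ar' in the printed word list):

from nltk.stem.lancaster import LancasterStemmer

stemmer = LancasterStemmer()
# "have" and "Having" both reduce to the same lowercase root (e.g. 'hav'),
# so the bag of words treats them as a single feature
print(stemmer.stem("have".lower()))
print(stemmer.stem("Having".lower()))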
Next we convert each sentence in the training set into a bag of words.
# create our training data
training = []
output = []
# create an empty array for our output
output_empty = [0] * len(classes)

# training set, bag of words for each sentence
for doc in documents:
    # initialize our bag of words
    bag = []
    # list of tokenized words for the pattern
    pattern_words = doc[0]
    # stem each word
    pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]
    # create our bag of words array
    for w in words:
        bag.append(1 if w in pattern_words else 0)

    training.append(bag)
    # output is a '0' for each tag and '1' for current tag
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1
    output.append(output_row)

# sample training/output
i = 0
w = documents[i][0]
print([stemmer.stem(word.lower()) for word in w])
print(training[i])
print(output[i])
['how', 'ar', 'you', '?']
[0, 0, 0, 1, 0, 1,