The workflow for vectorizing text is as follows:
- Break the text down into small units (tokens)
- Encode each token as a numeric vector via a dictionary lookup table
- Pack these numeric vectors into sequence tensors and feed them into a deep learning network
There are two main methods for turning tokens into vectors:
- One-hot encoding of tokens
- Token embedding (see the minimal sketch right after this list)
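The examples below cover one-hot encoding only; as a point of reference, here is a minimal sketch of token embedding assuming Keras's Embedding layer, where the vocabulary size (1000), embedding dimension (64), and sequence length (10) are arbitrary illustrative values:
# minimal sketch of token embedding (illustrative values only)
from keras.models import Sequential
from keras.layers import Embedding
model=Sequential()
# maps each integer token index (< 1000) to a trainable, dense 64-dimensional vector
model.add(Embedding(input_dim=1000, output_dim=64, input_length=10))
# input shape: (samples, 10) integer indices; output shape: (samples, 10, 64) float vectors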
# One-hot encoding
# 1-1. One-hot encoding of word tokens
import numpy as np
samples=['I want to be a machine learning expert.','I need to study a lot.']
token_index={}  # empty dictionary that will map each token to its index
# build the token_index dictionary
for sample in samples:
    for word in sample.split():
        if word not in token_index:
            token_index[word]=len(token_index)+1  # assign each new word an index, starting from 1
# convert the tokens of each sample into one-hot vectors
max_length=10  # only encode the first 10 words of each sample
results=np.zeros(shape=(len(samples),max_length, max(token_index.values())+1))
for i, sample in enumerate(samples):
    for j, word in list(enumerate(sample.split()))[:max_length]:
        index=token_index.get(word)
        results[i,j,index]=1.
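As a quick sanity check (a minimal sketch; reverse_index is just an illustrative helper name), the one-hot tensor can be decoded back into words by inverting token_index and taking the argmax along the last axis:
# sanity check (illustrative): decode the one-hot tensor back into words
reverse_index={index:word for word,index in token_index.items()}
for vec in results:
    words=[reverse_index[j] for j in vec.argmax(axis=-1) if j!=0]  # index 0 only appears in all-zero (unused) rows
    print(words)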
# 1-2. Character-level one-hot encoding
import string
samples=['I want to be a machine learning expert.','I need to study a lot.']
characters=string.printable  # all printable ASCII characters
print(len(characters))  # 100
token_index=dict(zip(characters,range(1,len(characters)+1)))  # map each character to an index starting from 1
max_length=50  # only encode the first 50 characters of each sample
results=np.zeros((len(samples),max_length,max(token_index.values())+1))
print(results.shape)
for i, sample in enumerate(samples):
    for j, character in enumerate(sample[:max_length]):  # truncate to the first max_length characters
        index=token_index.get(character)
        results[i,j,index]=1.
# 1-3. One-hot encoding with Keras's built-in Tokenizer
from keras.preprocessing.text import Tokenizer
samples=['I want to be a machine learning expert.','I need to study a lot.']
tokenizer=Tokenizer(num_words=1000)  # keep only the 1,000 most frequent words
tokenizer.fit_on_texts(samples)
# convert the texts into sequences of integer word indices
sequences=tokenizer.texts_to_sequences(samples)
print(sequences)
# convert the texts into one-hot vectors
one_hot_results=tokenizer.texts_to_matrix(samples, mode='binary')  # binary mode: 1 if the word appears in the sample, 0 otherwise
print(one_hot_results)
# word-to-index lookup table built by the tokenizer
word_index=tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
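As another quick check (a minimal sketch; reverse_word_index is just an illustrative helper name), one_hot_results should have shape (number of samples, num_words), and the integer sequences can be mapped back to words by inverting word_index:
# sanity check (illustrative): inspect the shape and decode the sequences back into words
print(one_hot_results.shape)  # expected: (2, 1000)
reverse_word_index={index:word for word,index in word_index.items()}
for seq in sequences:
    print([reverse_word_index[i] for i in seq])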