In [1]:
%matplotlib inline
#The magic above lets plots display inline in the notebook.

#python includes
import sys

#standard probability includes:
import numpy as np #matrices and data structures
import scipy.stats as ss #standard statistical operations
import pandas as pd #keeps data organized, works well with data
import matplotlib
import matplotlib.pyplot as plt #plot visualization


## Step 1: Tokenization (20 points).

In [2]:
import re
# Token pattern, tried left to right at each position:
#   1) one or more "capital letter + period" pairs (abbreviations such as S.B.U.)
#   2) a single punctuation mark from  . , ! ? ; "
#   3) an optional leading # or @, then letters/digits/underscore/hyphen,
#      optionally followed by an apostrophe and 1-3 lowercase letters
wordRE = re.compile(r'((?:[A-Z]\.)+|(?:[\.,!?;"])|(?:(?:\#|\@)?[A-Za-z0-9_\-]+(?:\'[a-z]{1,3})?))', re.UNICODE)
def tokenize(sent):
    """Split a single sentence into tokens using the `wordRE` regular expression.

    input:  sent -- a single sentence as a string.
    output: a list of each "word" (token) found in the text, in order of
            appearance; characters that match none of the pattern's
            alternatives (e.g. whitespace) are skipped.
    """
    # The pattern has exactly one capturing group, so findall yields a flat
    # list of strings. The result must be returned explicitly: without the
    # return statement every caller would receive None.
    tokens = wordRE.findall(sent)
    return tokens



## Step 2: Feature Extraction (20 points).

In [3]:
def getFeaturesForTarget(tokens, targetI, wordToIndex):
    """Build the feature vector for one target token of a tokenized sentence.

    input:  tokens: a list of tokens,
            targetI: index for the target token
            wordToIndex: dict mapping 'word' to an index in the feature list.
    output: 1-D np.array of length 259 + 3 * (len(wordToIndex) + 1), laid out as
              [0]        1 if the target's first character is uppercase, else 0
              [1:258]    one-hot of the first character's code point
                         (slot 256 of this segment is shared by all code points >= 256)
              [258]      length of the target token
              next V+1   one-hot of the previous word (all zeros for the first token)
              next V+1   one-hot of the target word
              last V+1   one-hot of the next word (all zeros for the last token)
            where V = len(wordToIndex) and the extra final slot of each word
            segment marks an out-of-vocabulary word.
    """
    target = tokens[targetI]
    vocabSize = len(wordToIndex)
    oovIndex = vocabSize  # last slot of each word one-hot is reserved for OOV

    def oneHotWord(word):
        # One-hot over the vocabulary (+1 for OOV); None means "no such neighbour".
        vec = np.zeros(vocabSize + 1)
        if word is not None:
            vec[wordToIndex.get(word, oovIndex)] = 1
        return vec

    # Slicing instead of indexing yields '' for an empty token, so an empty
    # string no longer raises IndexError.
    firstChar = target[:1]

    # is the word capitalized ('' is not uppercase)
    wordCap = np.array([1 if firstChar.isupper() else 0])

    # first letter of the target word (left all-zero when the token is empty)
    letterVec = np.zeros(257)
    if firstChar:
        val = ord(firstChar)
        letterVec[val if val < 256 else 256] = 1

    # length of the word
    length = np.array([len(target)])

    # neighbouring words, when they exist
    prevWord = tokens[targetI - 1] if targetI > 0 else None
    nextWord = tokens[targetI + 1] if targetI + 1 < len(tokens) else None

    featureVector = np.concatenate((wordCap, letterVec, length,
                                    oneHotWord(prevWord),
                                    oneHotWord(target),
                                    oneHotWord(nextWord)))

    return featureVector


## Step 3: Train Model

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
def trainTagger(features, tags):
    """Fit several logistic-regression taggers and return the best one.

    inputs: features: feature vectors (i.e. X)
            tags: tags that correspond to each feature vector (i.e. y)
    output: model -- a trained (i.e. fit) sklearn.linear_model.LogisticRegression
            model: the candidate with the highest accuracy on a held-out
            development split (the first such candidate wins ties).

    Side effect: prints the chosen model and its development accuracy.
    """
    # train different models and pick the best according to a development set:
    Cs = [.001, .01, .1, 1, 10, 100, 1000, 10000]
    penalties = ['l1', 'l2']
    # 10% of the supplied data is held out as the development set
    X_train, X_dev, y_train, y_dev = train_test_split(features, tags,
                                                      test_size=0.10,
                                                      random_state=42)
    bestAcc = 0.0
    bestModel = None
    for pen in penalties:  # l1 or l2
        for c in Cs:  # c values
            # NOTE(review): `multi_class` is reported as deprecated in recent
            # scikit-learn releases -- confirm against the installed version.
            model = LogisticRegression(random_state=42, penalty=pen, multi_class='auto',
                                       solver='liblinear', C=c)
            model.fit(X_train, y_train)
            # Classifier .score() is mean accuracy on the given data; using it
            # avoids depending on `metrics`, which is only imported in a later cell.
            modelAcc = model.score(X_dev, y_dev)
            # `bestModel is None` guarantees a model is returned even if every
            # candidate scores 0.0 on the development set.
            if bestModel is None or modelAcc > bestAcc:
                bestModel = model
                bestAcc = modelAcc

    print("Chosen Best Model: \n", bestModel, "\nACC: %.3f"%bestAcc)

    return bestModel


## Step 4: Apply to new data

In [5]:
from sklearn import metrics
from collections import Counter
def testAndPrintAcurracies(tagger, features, true_tags):
    """Print the tagger's accuracy next to a most-frequent-tag baseline.

    inputs: tagger: an sklearn LogisticRegression object to perform tagging
            features: feature vectors (i.e. X)
            true_tags: tags that correspond to each feature vector (i.e. y)
    output: None -- both accuracies are printed.
    """
    # accuracy of the model's own predictions
    predicted = tagger.predict(features)
    modelAcc = metrics.accuracy_score(true_tags, predicted)
    print("\nModel Accuracy: %.3f" % modelAcc)

    # baseline: always predict the single most common true tag
    majorityTag, _ = Counter(true_tags).most_common(1)[0]
    baseline = [majorityTag for _ in range(len(true_tags))]
    baselineAcc = metrics.accuracy_score(true_tags, baseline)
    print("MostFreqTag Accuracy: %.3f" % baselineAcc)



## Given Methods

In [6]:
def getConllTags(filename):
    """Read a CoNLL-style part-of-speech tagged file.

    input:  filename for a conll style parts of speech tagged file; each
            non-blank line is "word<TAB>tag" and a blank line ends a sentence.
    output: a list of sentences, each a list of (word, tag) tuples, e.g.
            [[(word1, tag1), (word2, tag2)], ...]. Every blank line starts a
            new (possibly empty) sentence list, so a trailing blank line or
            consecutive blank lines produce empty lists that callers should skip.
    raises: ValueError if a non-blank line does not split into exactly two
            tab-separated fields.
    """
    wordTagsPerSent = [[]]
    sentNum = 0
    with open(filename, encoding='utf8') as f:
        for wordtag in f:
            wordtag = wordtag.strip()
            if wordtag:  # token line: "word\ttag"
                (word, tag) = wordtag.split("\t")
                wordTagsPerSent[sentNum].append((word, tag))
            else:  # blank line: new sentence
                wordTagsPerSent.append([])
                sentNum += 1
    return wordTagsPerSent


## Main

In [7]:
from sys import argv

# Default corpus file; replaced by the first command-line argument when the
# code runs as a plain script rather than inside IPython/Jupyter.
corpus = 'daily547.conll'
sampleSentences = \
    ['The horse raced past the barn fell.',
     'For 4 years, we attended S.B.U. in the CS program.',
     'Did you hear Sam tell me to "chill out" yesterday? #rude']

if __name__ == "__main__":

    if len(argv) > 1:  # replace with argument for filename if available
        try:
            get_ipython()
        except NameError:  # not in python notebook; use argv
            corpus = argv[1]

    ###########################################
    #1) Test The Tokenizer
    for sent in sampleSentences:
        print(sent, "\n", tokenize(sent), "\n")

    ###########################################
    #2) Run Feature Extraction:
    vocab = set()
    tagSet = set()
    taggedSents = getConllTags(corpus)
    for sent in taggedSents:
        if sent:  # skip the empty lists produced by blank lines
            words, tags = zip(*sent)
            vocab |= set(words)  # union of the words into the set
            tagSet |= set(tags)  # union of all the tags into the set
    # make dictionaries for converting words to index and tags to ids.
    # Sets have no stable iteration order across interpreter runs, so sort
    # to keep feature columns and class ids reproducible between runs.
    wordToIndex = {w: i for i, w in enumerate(sorted(vocab))}
    numToTag = sorted(tagSet)  # mapping index to tag
    tagToNum = {numToTag[i]: i for i in range(len(numToTag))}

    #2b) Call feature extraction on each target
    X = []
    y = []
    print("[Extracting Features]")
    for sent in taggedSents:
        if sent:
            words, tags = zip(*sent)
            for i in range(len(words)):
                y.append(tagToNum[tags[i]])  # append y with class label
                X.append(getFeaturesForTarget(words, i, wordToIndex))
    X, y = np.array(X), np.array(y)
    print("[Done X is ", X.shape, " y is ", y.shape, "]")
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.10,
                                                        random_state=42)
    print("[Broke into training/test. X_train is ", X_train.shape, "]")

    ####################################################
    #3 Train the model.
    print("[Training the model]")
    tagger = trainTagger(X_train, y_train)
    print("[done]")

    ###################################################
    #4 Test the tagger.
    testAndPrintAcurracies(tagger, X_test, y_test)

    ###################################################
    #5 Apply to example sentences:
    print("\n[Applying to sample sentences]")
    for sent in sampleSentences:
        tokens = tokenize(sent)
        sentX = []  # start a fresh feature list for each sentence
        for i in range(len(tokens)):
            sentX.append(getFeaturesForTarget(tokens, i, wordToIndex))
        pred_tags = tagger.predict(sentX)
        sentWithTags = zip(tokens, [numToTag[pt] for pt in pred_tags])
        print(sent, "\n  predicted tags: ", list(sentWithTags))


The horse raced past the barn fell.
['The', 'horse', 'raced', 'past', 'the', 'barn', 'fell', '.']

For 4 years, we attended S.B.U. in the CS program.
['For', '4', 'years', ',', 'we', 'attended', 'S.B.U.', 'in', 'the', 'CS', 'program', '.']

Did you hear Sam tell me to "chill out" yesterday? #rude
['Did', 'you', 'hear', 'Sam', 'tell', 'me', 'to', '"', 'chill', 'out', '"', 'yesterday', '?', '#rude']

[Extracting Features]
[Done X is  (7707, 9694)  y is  (7707,) ]
[Broke into training/test. X_train is  (6936, 9694) ]
[Training the model]
Chosen Best Model:
LogisticRegression(C=10000, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='auto',
n_jobs=None, penalty='l2', random_state=42, solver='liblinear',
tol=0.0001, verbose=0, warm_start=False)
ACC: 0.781
[done]

Model Accuracy: 0.817
MostFreqTag Accuracy: 0.173

[Applying to sample sentences]
The horse raced past the barn fell.
predicted tags:  [('The', 'D'), ('horse', 'N'), ('raced', 'N'), ('past', 'V'), ('the', 'D'), ('barn', 'N'), ('fell', 'N'), ('.', ',')]
For 4 years, we attended S.B.U. in the CS program.
predicted tags:  [('For', 'P'), ('4', '\$'), ('years', 'N'), (',', ','), ('we', 'O'), ('attended', 'V'), ('S.B.U.', 'N'), ('in', 'P'), ('the', 'D'), ('CS', '^'), ('program', 'N'), ('.', ',')]
Did you hear Sam tell me to "chill out" yesterday? #rude
predicted tags:  [('Did', 'V'), ('you', 'O'), ('hear', 'V'), ('Sam', 'N'), ('tell', 'V'), ('me', 'O'), ('to', 'P'), ('"', ','), ('chill', 'V'), ('out', 'P'), ('"', ','), ('yesterday', 'R'), ('?', ','), ('#rude', '#')]