In [1]:
%matplotlib inline 
#above allows plots to display on the screen. 
#CSE392 Answer Key

#python includes
import sys

#standard probability includes:
import numpy as np #matrices and data structures
import scipy.stats as ss #standard statistical operations
import pandas as pd #keeps data organized, works well with data
import matplotlib
import matplotlib.pyplot as plt #plot visualization

Step 1: Tokenization (20 points).

In [2]:
import re
wordRE = re.compile(r'((?:[A-Z]\.)+|(?:[\.,!?;"])|(?:(?:\#|\@)?[A-Za-z0-9_\-]+(?:\'[a-z]{1,3})?))', re.UNICODE)
def tokenize(sent):
    """Tokenize one sentence string into a list of word tokens.

    The regex matches, in priority order: dotted abbreviations (e.g. "S.B.U."),
    single punctuation marks, and words — optionally prefixed with # or @ and
    optionally ending in a short apostrophe suffix.

    Args:
        sent: a single sentence as a string.
    Returns:
        list of token strings, in order of appearance.
    """
    # The whole pattern is one capture group, so each match's full text
    # is exactly the token we want.
    return [match.group(0) for match in wordRE.finditer(sent)]

Step 2: Feature Extraction (20 points).

In [3]:
def getFeaturesForTarget(tokens, targetI, wordToIndex):
    #input: tokens: a list of tokens, 
    #       targetI: index for the target token
    #       wordToIndex: dict mapping ‘word’ to an index in the feature list. 
    #output: list (or np.array) of k feature values for the given target
    
    #is the word capitalized
    wordCap = np.array([1 if tokens[targetI][0].isupper() else 0])
    oovIndex = len(wordToIndex)
    
    #first letter of the target word
    letterVec = np.zeros(257)
    val = ord(tokens[targetI][0])
    if val < 256:
        letterVec[val] = 1
    else:
        letterVec[256] = 1
        
    #length of the word:
    length = np.array([len(tokens[targetI])])
    
    #previousWord:
    prevVec = np.zeros(len(wordToIndex)+1)#+1 for OOV
    if targetI > 0:
        try:
            prevVec[wordToIndex[tokens[targetI - 1]]] = 1
        except KeyError:
            prevVec[oovIndex] = 1
            pass#no features added
        
    #targetWord:
    targetVec = np.zeros(len(wordToIndex)+1)
    try:
        targetVec[wordToIndex[tokens[targetI]]] = 1
    except KeyError:
        targetVec[oovIndex] = 1
        #print("unable to find wordIndex for '", tokens[targetI], "' skipping")
        pass
    
    #nextWord
    nextVec = np.zeros(len(wordToIndex)+1)
    if targetI+1 < len(tokens) :
        try:
            nextVec[wordToIndex[tokens[targetI + 1]]] = 1
        except KeyError:
            nextVec[oovIndex] = 1
            pass
        
    featureVector = np.concatenate((wordCap, letterVec, length, prevVec,\
                                    targetVec, nextVec))
    
    return featureVector

Step 3: Train Model

In [4]:
from sklearn import metrics  # imported here: this cell must not rely on a later cell's import
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
def trainTagger(features, tags):
    """Grid-search a logistic-regression tagger over C and penalty.

    Holds out 10% of the data as a development set, trains one model per
    (penalty, C) combination, and keeps the model with the best dev accuracy.

    Args:
        features: feature vectors (i.e. X).
        tags: class labels corresponding to each feature vector (i.e. y).
    Returns:
        the best-performing fit sklearn.linear_model.LogisticRegression model.
    """
    #train different models and pick the best according to a development set:
    Cs = [.001, .01, .1, 1, 10, 100, 1000, 10000]
    penalties = ['l1', 'l2']
    X_train, X_dev, y_train, y_dev = train_test_split(features, tags,
                                                    test_size=0.10,
                                                    random_state=42)
    # start below any attainable accuracy so the first model is always kept
    # (with 0.0, a degenerate dev set could leave bestModel as None)
    bestAcc = -1.0
    bestModel = None
    for pen in penalties: #l1 or l2
        for c in Cs: #c values:
            # liblinear supports both l1 and l2 penalties
            model = LogisticRegression(random_state=42, penalty=pen, multi_class='auto',\
                                       solver='liblinear', C = c)
            model.fit(X_train, y_train)
            modelAcc = metrics.accuracy_score(y_dev, model.predict(X_dev))
            if modelAcc > bestAcc:
                bestModel = model
                bestAcc = modelAcc
    
    print("Chosen Best Model: \n", bestModel, "\nACC: %.3f"%bestAcc)
    
    return bestModel

Step 4: Apply to new data

In [5]:
from sklearn import metrics
from collections import Counter
def testAndPrintAcurracies(tagger, features, true_tags):
    """Print tagger accuracy on held-out data next to a majority-class baseline.

    Args:
        tagger: a fit sklearn LogisticRegression object used to predict tags.
        features: feature vectors (i.e. X).
        true_tags: gold tags corresponding to each feature vector (i.e. y).
    Returns:
        None (results are printed).
    """
    predictions = tagger.predict(features)
    print("\nModel Accuracy: %.3f" % metrics.accuracy_score(true_tags, predictions))
    # baseline: predict the single most frequent gold tag for every token
    majorityTag, _count = Counter(true_tags).most_common(1)[0]
    baseline = [majorityTag] * len(true_tags)
    print("MostFreqTag Accuracy: %.3f" % metrics.accuracy_score(true_tags, baseline))
    return
    

Given Methods

In [6]:
def getConllTags(filename):
    """Parse a CoNLL-style part-of-speech tagged file.

    Expects one tab-separated "word<TAB>tag" pair per line, with sentences
    separated by blank lines.

    Args:
        filename: path to the conll-style tagged file.
    Returns:
        list of sentences, each a list of (word, tag) tuples, e.g.
        [[(word1, tag1), (word2, tag2)], ...].
    """
    sentences = [[]]
    with open(filename, encoding='utf8') as conllFile:
        for line in conllFile:
            line = line.strip()
            if line:
                # still inside the current sentence
                word, tag = line.split("\t")
                sentences[-1].append((word, tag))
            else:
                # blank line: start a new sentence
                sentences.append([])
    return sentences

Main

In [7]:
from sys import argv

corpus = 'daily547.conll'
sampleSentences = \
    ['The horse raced past the barn fell.',
     'For 4 years, we attended S.B.U. in the CS program.',
     'Did you hear Sam tell me to "chill out" yesterday? #rude']

if __name__ == "__main__":
        
    if len(argv) > 1:#replace with argument for filename if available
        try:
            get_ipython()
        except: #not in python notebook; use argv
            corpus = argv[1]
    
    ###########################################
    #1) Test The Tokenizer
    for sent in sampleSentences:
        print(sent, "\n", tokenize(sent), "\n")
    
    ###########################################
    #2) Run Feature Extraction:
    #2a) load training data: 
    wordToIndex = set()
    tagToNum = set()
    taggedSents = getConllTags(corpus)
    for sent in taggedSents:
        if sent: 
            words, tags = zip(*sent)
            wordToIndex |= set(words) #union of the words into the set
            tagToNum |= set(tags) #union of all the tags into the set
    print("[Read ", len(taggedSents), " Sentences]")
    #make dictionaries for converting words to index and tags to ids:
    wordToIndex = {w: i for i, w in enumerate(wordToIndex)} 
    numToTag = list(tagToNum) #mapping index to tag
    tagToNum = {numToTag[i]: i for i in range(len(numToTag))}
    
    #2b) Call feature extraction on each target
    X = []
    y = []
    print("[Extracting Features]")
    for sent in taggedSents:
        if sent: 
            words, tags = zip(*sent)
            for i in range(len(words)):
                y.append(tagToNum[tags[i]]) #append y with class label
                X.append(getFeaturesForTarget(words, i, wordToIndex))
    X, y = np.array(X), np.array(y)
    print("[Done X is ", X.shape, " y is ", y.shape, "]")
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.10,
                                                    random_state=42)
    print("[Broke into training/test. X_train is ", X_train.shape, "]")
    
    ####################################################
    #3 Train the model. 
    print("[Training the model]")
    tagger = trainTagger(X_train, y_train)
    print("[done]")
    
    ###################################################
    #4 Test the tagger.
    testAndPrintAcurracies(tagger, X_test, y_test)
    
    ###################################################
    #5 Apply to example sentences:
    print("\n[Applying to sample sentences]")
    for sent in sampleSentences:
        tokens = tokenize(sent)
        sentX = []from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
def trainTagger(features, tags):
    #inputs: features: feature vectors (i.e. X)
    #        tags: tags that correspond to each feature vector (i.e. y)
    #output: model -- a trained (i.e. fit) sklearn.lienear_model.LogisticRegression model
    #print(features[:3], tags[:3])
    
    #train different models and pick the best according to a development set:
    Cs = [.001, .01, .1, 1, 10, 100, 1000, 10000]
    penalties = ['l1', 'l2']
    X_train, X_dev, y_train, y_dev = train_test_split(features, tags,
                                                    test_size=0.10,
                                                    random_state=42)
    bestAcc = 0.0
    bestModel = None
    for pen in penalties: #l1 or l2
        for c in Cs: #c values:
            model = LogisticRegression(random_state=42, penalty=pen, multi_class='auto',\
                                       solver='liblinear', C = c)
            model.fit(X_train, y_train)
            modelAcc = metrics.accuracy_score(y_dev, model.predict(X_dev))
            if modelAcc > bestAcc:
                bestModel = model
                bestAcc = modelAcc
    
    print("Chosen Best Model: \n", bestModel, "\nACC: %.3f"%bestAcc)
    
    return bestModel
        for i in range(len(tokens)):
            sentX.append(getFeaturesForTarget(tokens, i, wordToIndex))
        pred_tags = tagger.predict(sentX)
        sentWithTags = zip(tokens, [numToTag[pt] for pt in pred_tags])
        print(sent, "\n  predicted tags: ", list(sentWithTags))
    
    
The horse raced past the barn fell. 
 ['The', 'horse', 'raced', 'past', 'the', 'barn', 'fell', '.'] 

For 4 years, we attended S.B.U. in the CS program. 
 ['For', '4', 'years', ',', 'we', 'attended', 'S.B.U.', 'in', 'the', 'CS', 'program', '.'] 

Did you hear Sam tell me to "chill out" yesterday? #rude 
 ['Did', 'you', 'hear', 'Sam', 'tell', 'me', 'to', '"', 'chill', 'out', '"', 'yesterday', '?', '#rude'] 

[Read  548  Sentences]
[Extracting Features]
[Done X is  (7707, 9694)  y is  (7707,) ]
[Broke into training/test. X_train is  (6936, 9694) ]
[Training the model]
Chosen Best Model: 
 LogisticRegression(C=10000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=42, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False) 
ACC: 0.781
[done]

Model Accuracy: 0.817
MostFreqTag Accuracy: 0.173

[Applying to sample sentences]
The horse raced past the barn fell. 
  predicted tags:  [('The', 'D'), ('horse', 'N'), ('raced', 'N'), ('past', 'V'), ('the', 'D'), ('barn', 'N'), ('fell', 'N'), ('.', ',')]
For 4 years, we attended S.B.U. in the CS program. 
  predicted tags:  [('For', 'P'), ('4', '$'), ('years', 'N'), (',', ','), ('we', 'O'), ('attended', 'V'), ('S.B.U.', 'N'), ('in', 'P'), ('the', 'D'), ('CS', '^'), ('program', 'N'), ('.', ',')]
Did you hear Sam tell me to "chill out" yesterday? #rude 
  predicted tags:  [('Did', 'V'), ('you', 'O'), ('hear', 'V'), ('Sam', 'N'), ('tell', 'V'), ('me', 'O'), ('to', 'P'), ('"', ','), ('chill', 'V'), ('out', 'P'), ('"', ','), ('yesterday', 'R'), ('?', ','), ('#rude', '#')]