Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ conf_files/UW_english/UW_AMT_description_documents_per_image_nopreproc_stop_raw.
Validation/get_just_results.py
Validation/interval_run.py
Validation/macro-pos5DescrNegDocVecdistractorTest.py
.DS_Store
106 changes: 106 additions & 0 deletions GroundedLanguageLearning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# Created by: Luke Richards
# Purpose: This file outlines a general framework for a more modular GLS. This code as of now will not run as it is pseudo-code written by Frank Ferraro

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim
import pandas as pd

class GroundedLanguageClassifier(nn.Module):
    """Score how well a piece of text corresponds to a set of percepts.

    The model is a thin composition of three injected callables: a text
    encoder, a percept encoder, and a scorer that consumes both encodings.
    """

    def __init__(self, text_encoder, percept_encoder, corr_scorer):
        """Store the three components.

        Args:
            text_encoder: callable applied to the ``text`` argument of
                ``forward``.
            percept_encoder: callable applied to the ``percepts`` argument
                of ``forward``.
            corr_scorer: callable taking ``(text_rep, percept_rep)``; it is
                stored as ``self.scoring``.
        """
        super(GroundedLanguageClassifier, self).__init__()
        self.text_encoder = text_encoder
        self.percept_encoder = percept_encoder
        self.scoring = corr_scorer

    def forward(self, text, percepts):
        """Encode both inputs and return whatever the scorer produces."""
        # Arguments are evaluated left to right, so the text encoder still
        # runs before the percept encoder.
        return self.scoring(
            self.text_encoder(text),
            self.percept_encoder(percepts),
        )

class EmptyExtractor(nn.Module):  # Semantic parsing: Luke's research
    """Placeholder extractor that ignores its inputs."""

    def __init__(self):
        super(EmptyExtractor, self).__init__()

    def forward(self, *args):
        """Return an empty 1-D tensor regardless of ``args``."""
        no_features = torch.empty(0)
        return no_features

# Extract features from Luke RSS 2019 dataset
# from a csv file, read in file, forward a single vector for given image name
class CNNFeatureExtractor(nn.Module):
    """Look up precomputed feature rows from a CSV file by image name."""

    def __init__(self, filePath, num_features):
        """Load the CSV at ``filePath`` and index it by ``image_name``.

        Args:
            filePath: path handed to ``pandas.read_csv``; the file must
                contain an ``image_name`` column.
            num_features: accepted but not used by this class.
        """
        super(CNNFeatureExtractor, self).__init__()
        table = pd.read_csv(filePath)
        self.data = table.set_index('image_name')

    def forward(self, image_name, *args):
        """Return the values of the row labelled ``image_name``.

        Extra positional arguments are ignored.
        """
        row = self.data.loc[image_name, :]
        return row.values

class MultiLabelBinaryMLPScorer(nn.Module):
    """MLP that emits one log-sigmoid score per binary classifier.

    Text and visual features are concatenated along the last dimension,
    passed through ``num_layers`` hidden ``nn.Linear`` layers (each followed
    by ``activation``) and projected to ``num_classifiers`` logits by a
    final linear layer that has no activation.
    """

    def __init__(self, input_size, num_classifiers,
                 num_layers=0, layer_size=None,
                 activation=nn.functional.tanh):
        """Build the layer stack.

        Args:
            input_size: width of the concatenated text + visual features.
            num_classifiers: number of output scores.
            num_layers: number of hidden layers.
            layer_size: either one int (used for every hidden layer) or a
                list/tuple with exactly ``num_layers`` ints. ``None`` means
                an empty list.
            activation: callable applied after every hidden layer.

        Raises:
            ValueError: a list/tuple ``layer_size`` whose length differs
                from ``num_layers``.
            TypeError: ``layer_size`` is neither an int nor a list/tuple.
        """
        # nn.Module has to be initialised before submodules are attached.
        super(MultiLabelBinaryMLPScorer, self).__init__()

        hidden_sizes = self._get_layer_dims(num_layers, layer_size)
        # A ModuleList (unlike a plain list) registers the layers, so
        # ``parameters()`` exposes their weights to an optimizer.
        self.layers = nn.ModuleList()
        self.activation_fns = []
        prev_size = input_size
        for size in hidden_sizes:
            self.layers.append(nn.Linear(prev_size, size))
            self.activation_fns.append(activation)
            prev_size = size
        self.layers.append(nn.Linear(prev_size, num_classifiers))
        # An empty Sequential acts as the identity for the output layer.
        self.activation_fns.append(nn.Sequential())

    def _get_layer_dims(self, num_layers, layer_size):
        """Normalise ``layer_size`` into a list of ``num_layers`` widths."""
        if layer_size is None:
            layer_size = []
        # bool is a subclass of int but is never a sensible layer width.
        if isinstance(layer_size, int) and not isinstance(layer_size, bool):
            return [layer_size] * num_layers
        if isinstance(layer_size, (list, tuple)):
            if len(layer_size) != num_layers:
                raise ValueError(
                    "layer_size has %d entries but num_layers is %d"
                    % (len(layer_size), num_layers))
            return list(layer_size)
        raise TypeError(
            "layer_size must be an int or a list/tuple of ints, got %s"
            % type(layer_size).__name__)

    def forward(self, text_feats, vis_feats):
        """Return ``logsigmoid`` of the MLP output for the joined features.

        Inputs are converted with ``torch.as_tensor`` to the dtype and
        device of the first layer's weight before being concatenated, so
        array-like inputs (not only tensors) are accepted.
        """
        weight = self.layers[0].weight
        parts = [
            torch.as_tensor(feats, dtype=weight.dtype, device=weight.device)
            for feats in (text_feats, vis_feats)
        ]
        logits = torch.cat(parts, dim=-1)
        for act_fn, layer in zip(self.activation_fns, self.layers):
            logits = act_fn(layer(logits))
        return nn.functional.logsigmoid(logits)

class GLSLearner:
    """Training / evaluation driver around a grounded-language model."""

    def __init__(self, model, loss=nn.NLLLoss(), optim_factory=torch.optim.Adam):
        """Bind a model to a loss and an optimizer.

        Args:
            model: module called as ``model(text, percepts)``; its
                ``parameters()`` are handed to the optimizer.
            loss: callable invoked as ``loss(predictions, labels)``.
            optim_factory: callable that builds an optimizer from the
                model parameters.
        """
        self.model = model
        self.loss = loss
        self.optimizer = optim_factory(self.model.parameters())

    def train(self, train_dataset, val_dataset, train_params):
        """Run the optimisation loop until ``train_params`` says stop.

        ``train_params.continue_training(loss_val, iter_index)`` is asked
        before every step; ``loss_val`` is ``None`` on the first call and
        the previous step's scalar loss afterwards.
        ``train_params.do_val(iter_index)`` decides whether a validation
        batch is scored after the step.

        Batches come from ``get_next_batch(dataset, iter_index)``, which is
        not defined in this class and must be available at module level.
        Each batch must expose ``text``, ``percepts`` and ``labels``.
        """
        iter_index = 0
        loss_val = None  # no loss has been computed before the first step
        while train_params.continue_training(loss_val, iter_index):
            train_batch = get_next_batch(train_dataset, iter_index)
            # Gradients accumulate across backward() calls unless cleared.
            self.optimizer.zero_grad()
            predicted_values = self.model(train_batch.text, train_batch.percepts)
            # torch loss modules take (input, target) positionally.
            loss_score = self.loss(predicted_values, train_batch.labels)
            loss_score.backward()
            self.optimizer.step()
            loss_val = loss_score.item()
            if train_params.do_val(iter_index):
                val_batch = get_next_batch(val_dataset, iter_index)
                predictions = self.predict(val_batch)
                self.evaluate(val_batch.labels, predictions)
            iter_index += 1

    def predict(self, dataset):
        """Return model outputs for ``dataset`` without tracking gradients.

        ``dataset`` must expose ``text`` and ``percepts`` attributes, the
        same shape of object that ``train`` passes in for validation.
        """
        with torch.no_grad():
            return self.model(dataset.text, dataset.percepts)

    def evaluate(self, y_true, y_pred):
        """Placeholder evaluation hook; currently only prints a marker."""
        ## perform whatever evaluations
        print("eval")
Binary file added OLD GLS/.DS_Store
Binary file not shown.
107 changes: 107 additions & 0 deletions OLD GLS/NegativeExampleDataGenerator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import numpy as np
import math
import os
import sys
import collections
import pickle
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec


######## Negative Example Generation##########
class LabeledLineSentence(object):
    """Pair tokenised documents with labels as ``LabeledSentence`` items."""

    def __init__(self, docLists, docLabels):
        """Keep the parallel sequences of word lists and labels."""
        self.docLists = docLists
        self.docLabels = docLabels

    def __iter__(self):
        """Yield one ``LabeledSentence(words, [label])`` per document."""
        for index, arDoc in enumerate(self.docLists):
            yield LabeledSentence(arDoc, [self.docLabels[index]])

    def to_array(self):
        """Materialise the sentences into ``self.sentences`` and return it."""
        # Reuse __iter__ so the pairing logic lives in one place.
        self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Shuffle ``self.sentences`` in place and return it.

        The list is built on demand, so calling this before ``to_array``
        no longer fails with AttributeError.
        """
        from random import shuffle
        if not hasattr(self, 'sentences'):
            self.to_array()
        shuffle(self.sentences)
        return self.sentences


class NegSampleSelection(object):
    """Rank, for every document, all other documents by Doc2Vec angle.

    ``docs`` maps an instance key to its description sentence. Keys are
    split on "/" and the second component is used as the Doc2Vec label.
    """
    # __slots__ avoids a per-instance __dict__. There is deliberately no
    # class-level ``docs`` default: on a new-style class a class attribute
    # with the same name as a slot raises ValueError at class creation.
    __slots__ = ['docs']

    def __init__(self, docs):
        """Store ``docs`` as an OrderedDict sorted by key."""
        self.docs = collections.OrderedDict(sorted(docs.items()))

    def sentenceToWordLists(self):
        """Return each sentence split on single spaces, in key order."""
        return [sent.split(" ") for sent in self.docs.values()]

    def square_rooted(self, x):
        """Return the Euclidean norm of ``x`` rounded to 3 decimals."""
        return round(math.sqrt(sum(a * a for a in x)), 3)

    def cosine_similarity(self, x, y):
        """Return the cosine of ``x`` and ``y`` rounded to 3 decimals.

        Raises ZeroDivisionError when either rounded norm is zero.
        """
        numerator = sum(a * b for a, b in zip(x, y))
        denominator = self.square_rooted(x) * self.square_rooted(y)
        return round(numerator / float(denominator), 3)

    def generateNegatives(self):
        """Train Doc2Vec on the documents and rank neighbours by angle.

        Returns a dict mapping every instance key to a comma-separated
        string of "<other key>-<angle in degrees>" entries, smallest angle
        first, with the instance itself left out.
        """
        # list() so the keys can be iterated repeatedly on Python 2 and 3.
        docNames = list(self.docs.keys())
        docLists = self.sentenceToWordLists()
        # Raises IndexError for a key that contains no "/".
        docLabels = [key.split("/")[1] for key in docNames]
        sentences = LabeledLineSentence(docLists, docLabels)
        model = Doc2Vec(min_count=1, window=10, size=2000, sample=1e-4, negative=5, workers=8)
        model.build_vocab(sentences.to_array())
        # NOTE(review): len() is taken of each LabeledSentence object, not
        # of its word list - confirm this is the intended total_examples.
        token_count = sum([len(sentence) for sentence in sentences])
        # NOTE(review): loop extent reconstructed; all four statements are
        # assumed to belong to the epoch loop - confirm against history.
        for epoch in range(10):
            model.train(sentences.sentences_perm(), total_examples=token_count, epochs=model.iter)
            model.alpha -= 0.002  # decrease the learning rate
            model.min_alpha = model.alpha  # fix the learning rate, no decay
            model.train(sentences.sentences_perm(), total_examples=token_count, epochs=model.iter)

        # Fetch every document vector once instead of once per pair.
        docVecs = [model.docvecs[label] for label in docLabels]
        degreeMap = {}
        for cInstance, fDoc in zip(docNames, docVecs):
            cInstMap = {}
            for tInstance, tDoc in zip(docNames, docVecs):
                # Clamp so rounding noise cannot leave acos's domain.
                cosineVal = max(-1.0, min(self.cosine_similarity(fDoc, tDoc), 1.0))
                try:
                    cValue = math.degrees(math.acos(cosineVal))
                except ValueError:
                    print("ERROR: invalid cosine value")
                    print(cosineVal)
                    print(fDoc)
                    print(tDoc)
                    raise SystemExit(1)
                cInstMap[tInstance] = cValue
            degreeMap[cInstance] = cInstMap
        return self._rankNegatives(degreeMap)

    def _rankNegatives(self, degreeMap):
        """Serialise each instance's neighbours, smallest angle first."""
        negInstances = {}
        for k in sorted(degreeMap):
            ranked = sorted(degreeMap[k].items(), key=lambda x: x[1])
            negInstances[k] = ",".join(
                name + "-" + str(angle) for name, angle in ranked if name != k)
        return negInstances
############Negative Example Generation --- END ########
33 changes: 33 additions & 0 deletions OLD GLS/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Grounded Language Learning System

General system framework for learning word-as-classifier groundings

### Prerequisites

- python 2.7
- pandas
- gensim

## Running the tests

Visual features should be stored in a folder one directory above GLS


#### Preprocessing language input

```
python2 preprocess_descriptions.py <language name> <language> <"stop", "lemm", or "stemm">
```

#### Learning

```
python2 cLL-ML.py --resDir <folder for result output> --cat <category of learning: rgb, shape, object, all> --pre <formatted language conf file> --cutoff <threshold for negative example selection> --seed <seed for random selection> --visfeat <location of visual feature folder hierarchy> --listof <list of instances or images conf file> --negexmpl <optional: import negative examples previously computed to save time>
```

#### Testing / Validation

```
python2 macro-pos5DescrNegDocVecdistractorTest.py <result folder>/NoOfDataPoints/ <category: rgb, shape, object, all> <category: rgb, shape, object, all> <formatted language conf file>

```
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
126 changes: 4 additions & 122 deletions cLL-ML.py → OLD GLS/cLL-ML.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
#!/usr/bin/env python
from NegativeExampleDataGenerator import *
import numpy as np
import codecs
from sklearn import preprocessing
Expand Down Expand Up @@ -111,128 +112,7 @@ def fileAppend(fName, sentence):
myfile.write(sentence)
myfile.write("\n")

######## Negative Example Generation##########
class LabeledLineSentence(object):
    """Pair tokenised documents with labels as ``LabeledSentence`` items."""

    def __init__(self, docLists, docLabels):
        """Keep the parallel sequences of word lists and labels."""
        self.docLists = docLists
        self.docLabels = docLabels

    def __iter__(self):
        """Yield one ``LabeledSentence(words, [label])`` per document."""
        for index, arDoc in enumerate(self.docLists):
            yield LabeledSentence(arDoc, [self.docLabels[index]])

    def to_array(self):
        """Materialise the sentences into ``self.sentences`` and return it."""
        # Reuse __iter__ so the pairing logic lives in one place.
        self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Shuffle ``self.sentences`` in place and return it.

        The list is built on demand, so calling this before ``to_array``
        no longer fails with AttributeError.
        """
        from random import shuffle
        if not hasattr(self, 'sentences'):
            self.to_array()
        shuffle(self.sentences)
        return self.sentences

class NegSampleSelection(object):
    """Class to bundle negative example generation functions and variables.

    ``docs`` maps an object instance key to its annotation sentence. Keys
    are split on "/" and the second component is used as the Doc2Vec label.
    """
    # __slots__ avoids a per-instance __dict__. There is deliberately no
    # class-level ``docs`` default: on a new-style class a class attribute
    # with the same name as a slot raises ValueError at class creation.
    __slots__ = ['docs']

    def __init__(self, docs):
        """Store ``docs`` as an OrderedDict sorted by key.

        Args:
            docs: dictionary where the key is an object instance and the
                value is the object annotation.
        """
        self.docs = collections.OrderedDict(sorted(docs.items()))

    def sentenceToWordLists(self):
        """Return each sentence split on single spaces, in key order."""
        return [sent.split(" ") for sent in self.docs.values()]

    def sentenceToWordDicts(self):
        """Return a dict mapping each key to its space-split sentence."""
        return dict((key, sent.split(" ")) for key, sent in self.docs.items())

    def square_rooted(self, x):
        """Return the Euclidean norm of ``x`` rounded to 3 decimals."""
        return round(math.sqrt(sum(a * a for a in x)), 3)

    def cosine_similarity(self, x, y):
        """Return the cosine of ``x`` and ``y`` rounded to 3 decimals.

        Raises ZeroDivisionError when either rounded norm is zero.
        """
        numerator = sum(a * b for a, b in zip(x, y))
        denominator = self.square_rooted(x) * self.square_rooted(y)
        return round(numerator / float(denominator), 3)

    def generateNegatives(self):
        """Train Doc2Vec on the documents and rank neighbours by angle.

        Returns a dict mapping every instance key to a comma-separated
        string of "<other key>-<angle in degrees>" entries, smallest angle
        first, with the instance itself left out. The same dict is pickled
        to 'NegExamples_<resultDir>.pickle'; ``resultDir`` is a name that
        must be defined at module level outside this class.
        """
        # list() so the keys can be iterated repeatedly on Python 2 and 3.
        docNames = list(self.docs.keys())
        docLists = self.sentenceToWordLists()
        # Raises IndexError for a key that contains no "/".
        docLabels = [key.split("/")[1] for key in docNames]
        sentences = LabeledLineSentence(docLists, docLabels)
        model = Doc2Vec(min_count=1, window=10, size=2000, sample=1e-4, negative=5, workers=8)

        model.build_vocab(sentences.to_array())
        # NOTE(review): len() is taken of each LabeledSentence object, not
        # of its word list - confirm this is the intended total_examples.
        token_count = sum([len(sentence) for sentence in sentences])
        # NOTE(review): loop extent reconstructed; all four statements are
        # assumed to belong to the epoch loop - confirm against history.
        for epoch in range(10):
            model.train(sentences.sentences_perm(), total_examples=token_count, epochs=model.iter)
            model.alpha -= 0.002  # decrease the learning rate
            model.min_alpha = model.alpha  # fix the learning rate, no decay
            model.train(sentences.sentences_perm(), total_examples=token_count, epochs=model.iter)

        # Fetch every document vector once instead of once per pair.
        docVecs = [model.docvecs[label] for label in docLabels]
        degreeMap = {}
        for cInstance, fDoc in zip(docNames, docVecs):
            cInstMap = {}
            for tInstance, tDoc in zip(docNames, docVecs):
                # Clamp so rounding noise cannot leave acos's domain.
                cosineVal = max(-1.0, min(self.cosine_similarity(fDoc, tDoc), 1.0))
                try:
                    cValue = math.degrees(math.acos(cosineVal))
                except ValueError:
                    print("ERROR: invalid cosine value")
                    print(cosineVal)
                    print(fDoc)
                    print(tDoc)
                    raise SystemExit(1)
                cInstMap[tInstance] = cValue
            degreeMap[cInstance] = cInstMap
        negInstances = self._rankNegatives(degreeMap)

        # pickle negative examples for later use
        with open('NegExamples_' + resultDir + '.pickle', 'wb') as handle:
            pickle.dump(negInstances, handle, protocol=pickle.HIGHEST_PROTOCOL)

        return negInstances

    def _rankNegatives(self, degreeMap):
        """Serialise each instance's neighbours, smallest angle first."""
        negInstances = {}
        for k in sorted(degreeMap):
            ranked = sorted(degreeMap[k].items(), key=lambda x: x[1])
            negInstances[k] = ",".join(
                name + "-" + str(angle) for name, angle in ranked if name != k)
        return negInstances

############Negative Example Generation --- END ########


class Category:
""" Class to bundle our dataset functions and variables category wise. """
Expand Down Expand Up @@ -610,6 +490,8 @@ def getDataSet(self,cDf,nDf,tests,fName):
negExamples = pickle.load(handle)
else:
negExamples = negSelection.generateNegatives()
with open('NegExamples_RECENT.pickle', 'wb') as handle:
pickle.dump(negExamples, handle, protocol=pickle.HIGHEST_PROTOCOL)

""" find negative instances for all tokens.
"""
Expand Down
Loading