iral-lab · ghost · Aug 12, 2019 · Aug 15, 2019 · Apr 8, 2020 · Apr 8, 2020
diff --git a/.DS_Store b/.DS_Store
diff --git a/.gitignore b/.gitignore
@@ -13,3 +13,4 @@ conf_files/UW_english/UW_AMT_description_documents_per_image_nopreproc_stop_raw.
 Validation/get_just_results.py
 Validation/interval_run.py
 Validation/macro-pos5DescrNegDocVecdistractorTest.py
+.DS_Store
diff --git a/GroundedLanguageLearning.py b/GroundedLanguageLearning.py
@@ -0,0 +1,106 @@
+# Created by: Luke Richards
+# Purpose: This file outlines a general framework for a more modular GLS. This code as of now will not run as it is pseudo-code written by Frank Ferraro
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim
+import pandas as pd
+
+class GroundedLanguageClassifier(nn.Module):
+    """Scores how well a piece of text corresponds to a set of percepts.
+
+    The model is assembled from three callables supplied by the caller: one
+    that encodes the text, one that encodes the percepts, and one that turns
+    the two encodings into a correspondence score.
+    """
+
+    def __init__(self, text_encoder, percept_encoder, corr_scorer):
+        super(GroundedLanguageClassifier, self).__init__()
+        # Assignment order is kept as-is: when the components are nn.Modules
+        # it determines the order in which they are registered.
+        self.text_encoder = text_encoder
+        self.percept_encoder = percept_encoder
+        self.scoring = corr_scorer
+
+    def forward(self, text, percepts):
+        """Encode both inputs and return the scorer's output for the pair."""
+        encoded_text = self.text_encoder(text)
+        encoded_percepts = self.percept_encoder(percepts)
+        return self.scoring(encoded_text, encoded_percepts)
+
+class EmptyExtractor(nn.Module): #Semantic parsing: Luke's research
+    """Placeholder extractor that ignores whatever it is given."""
+
+    def __init__(self):
+        super(EmptyExtractor, self).__init__()
+
+    def forward(self, *args):
+        """Return a zero-element tensor, regardless of the arguments."""
+        no_features = torch.empty(0)
+        return no_features
+
+# Feature lookup for the Luke RSS 2019 dataset: the features are read from a
+# csv file once, and forward() returns the row stored for a given image name.
+class CNNFeatureExtractor(nn.Module):
+    """Serves pre-computed feature rows from a csv file keyed by image name.
+
+    The csv must contain an 'image_name' column, which becomes the index of
+    the table; every remaining column is returned as part of the row.
+    """
+
+    def __init__(self, filePath,num_features):
+        super(CNNFeatureExtractor, self).__init__()
+        # num_features is accepted but not used by this class.
+        table = pd.read_csv(filePath)
+        self.data = table.set_index('image_name')
+
+    def forward(self, image_name,*args):
+        """Return the values stored for image_name; extra args are ignored."""
+        row = self.data.loc[image_name, :]
+        return row.values
+
+class MultiLabelBinaryMLPScorer(nn.Module):
+    """Multi-layer perceptron with one binary output per classifier.
+
+    The text and visual features are concatenated along their last dimension,
+    passed through the hidden layers (each followed by `activation`) and a
+    final linear layer with `num_classifiers` outputs, and returned as
+    log-sigmoid scores.
+
+    Args:
+        input_size: width of the concatenated text + visual feature vector.
+        num_classifiers: number of outputs of the final linear layer.
+        num_layers: number of hidden layers.
+        layer_size: either an int (every hidden layer gets that width) or a
+            list/tuple with exactly `num_layers` widths. None means "no
+            hidden layers" and is only valid with num_layers == 0.
+        activation: callable applied after every hidden layer.
+
+    Raises:
+        ValueError: if a list of widths does not have `num_layers` entries.
+        TypeError: if `layer_size` is neither an int nor a list/tuple.
+    """
+
+    def __init__(self, input_size, num_classifiers,
+        num_layers=0, layer_size=None,
+        activation=nn.functional.tanh):
+        # nn.Module has to be initialised before sub-modules are attached and
+        # before the instance can be called.
+        super(MultiLabelBinaryMLPScorer, self).__init__()
+
+        layer_size_list = self._get_layer_dims(num_layers, layer_size)
+        # A ModuleList (not a plain list) registers the layers, so their
+        # weights show up in parameters() and reach the optimizer.
+        self.layers = nn.ModuleList()
+        self.activation_fns = []
+        prev_size = input_size
+        for width in layer_size_list:
+            self.layers.append(nn.Linear(prev_size, width))
+            self.activation_fns.append(activation)
+            prev_size = width
+        self.layers.append(nn.Linear(prev_size, num_classifiers))
+        # An empty Sequential returns its input unchanged: the output layer
+        # has no activation of its own before the log-sigmoid in forward().
+        self.activation_fns.append(nn.Sequential())
+
+    def _get_layer_dims(self, num_layers, layer_size):
+        """Normalise `layer_size` into a list with one width per hidden layer."""
+        if layer_size is None:
+            layer_size = []
+        if isinstance(layer_size, (list, tuple)):
+            if len(layer_size) != num_layers:
+                raise ValueError(
+                    "layer_size has %d entries but num_layers is %d"
+                    % (len(layer_size), num_layers))
+            return list(layer_size)
+        if isinstance(layer_size, int):
+            return [layer_size for _ in range(num_layers)]
+        raise TypeError(
+            "layer_size must be an int or a list of ints, got %r"
+            % (type(layer_size),))
+
+    def forward(self, text_feats, vis_feats):
+        """Return log-sigmoid scores for the concatenated features."""
+        # Array-like inputs are converted to tensors of the layers' dtype so
+        # they can be fed to nn.Linear.
+        dtype = self.layers[0].weight.dtype
+        feats = [torch.as_tensor(f, dtype=dtype)
+                 for f in (text_feats, vis_feats)]
+        logits = torch.cat(feats, dim=-1)
+        for act_fn, layer in zip(self.activation_fns, self.layers):
+            logits = act_fn(layer(logits))
+        return nn.functional.logsigmoid(logits)
+
+class GLSLearner:
+    """Couples a model with a loss and an optimizer and runs the training loop.
+
+    Args:
+        model: module called as model(text, percepts).
+        loss: callable invoked as loss(predictions, labels), the argument
+            order used by torch.nn loss modules such as the default NLLLoss.
+        optim_factory: callable that builds an optimizer from the model's
+            parameters.
+    """
+
+    def __init__(self, model, loss = nn.NLLLoss(), optim_factory = torch.optim.Adam):
+        self.model = model
+        self.loss = loss
+        self.optimizer = optim_factory(self.model.parameters())
+
+    def train(self, train_dataset, val_dataset, train_params):
+        """Train until train_params.continue_training(...) returns false.
+
+        train_params must provide continue_training(loss_val, iter_index) and
+        do_val(iter_index). loss_val is None on the first call, because no
+        loss has been computed yet, and the latest training loss as a float
+        afterwards.
+        """
+        iter_index = 0
+        loss_val = None
+        while train_params.continue_training(loss_val, iter_index):
+            # NOTE(review): get_next_batch is not defined in this file; it is
+            # assumed to be supplied at module level -- confirm.
+            train_batch = get_next_batch(train_dataset, iter_index)
+            # Gradients accumulate across backward() calls unless cleared.
+            self.optimizer.zero_grad()
+            predicted_values = self.model(train_batch.text, train_batch.percepts)
+            loss_score = self.loss(predicted_values, train_batch.labels)
+            loss_score.backward()
+            self.optimizer.step()
+            loss_val = loss_score.item()
+            if train_params.do_val(iter_index):
+                val_batch = get_next_batch(val_dataset, iter_index)
+                predictions = self.predict(val_batch)
+                self.evaluate(val_batch.labels, predictions)
+            iter_index += 1
+
+    def predict(self, dataset):
+        """Return the model output for `dataset` without tracking gradients.
+
+        `dataset` must expose the same `text` and `percepts` attributes that
+        train() reads from its batches. The model is put in eval mode for the
+        call and its previous mode is restored afterwards.
+        """
+        was_training = self.model.training
+        self.model.eval()
+        try:
+            with torch.no_grad():
+                return self.model(dataset.text, dataset.percepts)
+        finally:
+            self.model.train(was_training)
+
+    def evaluate(self, y_true, y_pred):
+        """Placeholder for evaluation; currently only prints a marker."""
+        ## perform whatever evaluations
+        print("eval")
diff --git a/OLD GLS/.DS_Store b/OLD GLS/.DS_Store
diff --git a/OLD GLS/NegativeExampleDataGenerator.py b/OLD GLS/NegativeExampleDataGenerator.py
@@ -0,0 +1,107 @@
+import numpy as np
+import math
+import os
+import sys
+import collections
+import pickle
+from gensim.models.doc2vec import LabeledSentence
+from gensim.models import Doc2Vec
+
+
+######## Negative Example Generation##########
+class LabeledLineSentence(object):
+    """Pairs tokenised documents with their labels as LabeledSentence objects.
+
+    Args:
+        docLists: sequence of documents, each a list of words.
+        docLabels: sequence of labels, indexed in parallel with docLists.
+    """
+
+    def __init__(self,docLists,docLabels):
+        self.docLists = docLists
+        self.docLabels = docLabels
+        # Filled by to_array(); None until the sentences have been built.
+        self.sentences = None
+
+    def __iter__(self):
+        """Yield one LabeledSentence per document, tagged with its label."""
+        for index, arDoc in enumerate(self.docLists):
+            yield LabeledSentence(arDoc, [self.docLabels[index]])
+
+    def to_array(self):
+        """Build, store and return the list of LabeledSentence objects."""
+        self.sentences = list(self)
+        return self.sentences
+
+    def sentences_perm(self):
+        """Shuffle the stored sentences in place and return them.
+
+        The sentences are built first if to_array() has not been called yet,
+        instead of failing on the missing attribute.
+        """
+        from random import shuffle
+        if self.sentences is None:
+            self.to_array()
+        shuffle(self.sentences)
+        return self.sentences
+
+
+class NegSampleSelection:
+    """Ranks documents by the angle between their Doc2Vec vectors.
+
+    Args:
+        docs: dictionary mapping a document name to its description text.
+            generateNegatives() takes the second "/"-separated component of
+            each name as the Doc2Vec label, so names need at least one "/".
+    """
+    # using __slots__ to not to use a dict for the sake of space and speed.
+    # There is deliberately no class-level `docs` attribute: on Python 3 a
+    # class variable with the same name as a slot raises ValueError when the
+    # class is created.
+    __slots__ = ['docs']
+
+    def __init__(self, docs):
+        # Sorted by name so every run walks the documents in the same order.
+        self.docs = collections.OrderedDict(sorted(docs.items()))
+
+    def sentenceToWordLists(self):
+        """Return each document split on single spaces, in document order."""
+        return [sent.split(" ") for sent in self.docs.values()]
+
+    def square_rooted(self,x):
+        """Return the Euclidean norm of x, rounded to 3 decimals."""
+        return round(math.sqrt(sum([a*a for a in x])),3)
+
+    def cosine_similarity(self,x,y):
+        """Return the cosine similarity of x and y, rounded to 3 decimals.
+
+        Returns 0.0 when either (rounded) norm is zero, where the similarity
+        is undefined, instead of raising ZeroDivisionError.
+        """
+        numerator = sum(a*b for a,b in zip(x,y))
+        denominator = self.square_rooted(x)*self.square_rooted(y)
+        if denominator == 0:
+            return 0.0
+        return round(numerator/float(denominator),3)
+
+    def _train_doc2vec(self, docLists, docLabels):
+        """Train and return a Doc2Vec model on the labelled documents."""
+        sentences = LabeledLineSentence(docLists,docLabels)
+        model = Doc2Vec(min_count=1, window=10, size=2000, sample=1e-4, negative=5, workers=8)
+        model.build_vocab(sentences.to_array())
+        # NOTE(review): len() is taken of each LabeledSentence object, not of
+        # its word list -- confirm this is the intended total_examples value.
+        token_count = sum([len(sentence) for sentence in sentences])
+        for epoch in range(10):
+            model.train(sentences.sentences_perm(),total_examples = token_count,epochs=model.iter)
+            model.alpha -= 0.002 # decrease the learning rate
+            model.min_alpha = model.alpha # fix the learning rate, no decay
+            model.train(sentences.sentences_perm(),total_examples = token_count,epochs=model.iter)
+        return model
+
+    def _pairwise_angles(self, model, docNames, docLabels):
+        """Map every document name to {other name: angle in degrees}."""
+        # Look each vector up once instead of once per pair.
+        vectors = [model.docvecs[label] for label in docLabels]
+        degreeMap = {}
+        for i, fDoc in enumerate(vectors):
+            cInstMap = {}
+            for j, tDoc in enumerate(vectors):
+                # Clamp: rounding can push the value just outside [-1, 1].
+                cosineVal = max(-1.0,min(self.cosine_similarity(fDoc,tDoc),1.0))
+                try:
+                    cValue = math.degrees(math.acos(cosineVal))
+                except (ValueError, TypeError):
+                    print("ERROR: invalid cosine value")
+                    print(cosineVal)
+                    print(fDoc)
+                    print(tDoc)
+                    sys.exit(1)
+                cInstMap[docNames[j]] = cValue
+            degreeMap[docNames[i]] = cInstMap
+        return degreeMap
+
+    def _format_negatives(self, degreeMap):
+        """Render each angle map as "name-angle,..." sorted by rising angle."""
+        negInstances = {}
+        for k in sorted(degreeMap):
+            ss = sorted(degreeMap[k].items(), key=lambda x: x[1])
+            # The document itself is left out of its own ranking.
+            negInstances[k] = ",".join(
+                name + "-" + str(angle) for name, angle in ss if name != k)
+        return negInstances
+
+    def generateNegatives(self):
+        """Return, per document, the other documents ranked by angle.
+
+        Returns:
+            dict mapping each document name to a comma-separated string of
+            "<other name>-<angle in degrees>" entries, smallest angle first.
+        """
+        # list() so the names can be indexed on Python 3 as well.
+        docNames = list(self.docs.keys())
+        docLists = self.sentenceToWordLists()
+        docLabels = [key.split("/")[1] for key in docNames]
+        model = self._train_doc2vec(docLists, docLabels)
+        degreeMap = self._pairwise_angles(model, docNames, docLabels)
+        return self._format_negatives(degreeMap)
+############Negative Example Generation --- END ########
diff --git a/OLD GLS/README.md b/OLD GLS/README.md
@@ -0,0 +1,33 @@
+# Grounded Language Learning System
+
+General system framework for learning word-as-classifier groundings
+
+### Prerequisites
+
+- python 2.7 
+- pandas
+- gensim
+
+## Running the tests
+
+Visual features should be stored in a folder one directory above GLS
+
+
+#### Preprocessing language input
+
+```
+python2 preprocess_descriptions.py <language name> <language> <"stop", "lemm", or "stemm">
+```
+
+#### Learning
+
+```
+python2 cLL-ML.py --resDir <folder for result output> --cat <category of learning: rgb, shape, object, all> --pre <formatted language conf file> --cutoff <threshold for negative example selection> --seed <seed for random selection> --visfeat <location of visual feature folder hierarchy>  --listof <list of instances or images conf file> --negexmpl <optional: import negative examples previously computed to save time>
+```
+
+#### Testing / Validation 
+
+```
+python2 macro-pos5DescrNegDocVecdistractorTest.py <result folder>/NoOfDataPoints/ <category: rgb, shape, object, all> <category: rgb, shape, object, all> <formatted language conf file>
+
+```
diff --git a/UW_list_of_instances.conf → OLD GLS/UW_list_of_instances.conf b/UW_list_of_instances.conf → OLD GLS/UW_list_of_instances.conf
diff --git a/Validation/OG/Untitled → OLD GLS/Validation/OG/Untitled b/Validation/OG/Untitled → OLD GLS/Validation/OG/Untitled
diff --git a/Validation/get_just_results.py → OLD GLS/Validation/get_just_results.py b/Validation/get_just_results.py → OLD GLS/Validation/get_just_results.py
diff --git a/Validation/interval_run.py → OLD GLS/Validation/interval_run.py b/Validation/interval_run.py → OLD GLS/Validation/interval_run.py
diff --git a/...macro-pos5DescrNegDocVecdistractorTest.py → ...macro-pos5DescrNegDocVecdistractorTest.py b/...macro-pos5DescrNegDocVecdistractorTest.py → ...macro-pos5DescrNegDocVecdistractorTest.py
diff --git a/Validation/process_output.py → OLD GLS/Validation/process_output.py b/Validation/process_output.py → OLD GLS/Validation/process_output.py
diff --git a/Validation/python2 → OLD GLS/Validation/python2 b/Validation/python2 → OLD GLS/Validation/python2
diff --git a/Validation/read_in_results.py → OLD GLS/Validation/read_in_results.py b/Validation/read_in_results.py → OLD GLS/Validation/read_in_results.py
diff --git a/Validation/util.py → OLD GLS/Validation/util.py b/Validation/util.py → OLD GLS/Validation/util.py
diff --git a/Validation/util.pyc → OLD GLS/Validation/util.pyc b/Validation/util.pyc → OLD GLS/Validation/util.pyc
diff --git a/cLL-ML.py → OLD GLS/cLL-ML.py b/cLL-ML.py → OLD GLS/cLL-ML.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 #!/usr/bin/env python
+from NegativeExampleDataGenerator import *
 import numpy as np
 import codecs
 from sklearn import preprocessing
@@ -111,128 +112,7 @@ def fileAppend(fName, sentence):
     myfile.write(sentence)
     myfile.write("\n")
 
-######## Negative Example Generation##########
-class LabeledLineSentence(object):
-    def __init__(self,docLists,docLabels):
-        self.docLists = docLists
-        self.docLabels = docLabels
-
-    def __iter__(self):
-        for index, arDoc in enumerate(self.docLists):
-            yield LabeledSentence(arDoc, [self.docLabels[index]])
-
-    def to_array(self):
-        self.sentences = []
-        for index, arDoc in enumerate(self.docLists):
-            self.sentences.append(LabeledSentence(arDoc, [self.docLabels[index]]))
-        return self.sentences
-
-    def sentences_perm(self):
-        from random import shuffle
-        shuffle(self.sentences)
-        return self.sentences
-
-class NegSampleSelection:
-   """ Class to bundle negative example generation functions and variables. """
-   __slots__ = ['docs']
-   docs = {}
-   def __init__(self,docs):
-      """""""""""""""""""""""""""""""""""""""""
-                Initialization function for NegSampleSelection class
-                Args: Documents dictionary where key is object instance and value
-                      is object annotation
-                Returns: Nothing
-      """""""""""""""""""""""""""""""""""""""""
-      docs = collections.OrderedDict(sorted(docs.items()))
-      self.docs = docs
-
-   def sentenceToWordLists(self):
-      docLists = []
-      docs = self.docs
-      for key in docs.keys():
-         sent = docs[key]
-         wLists = sent.split(" ")
-         docLists.append(wLists)
-      return docLists
-
-   def sentenceToWordDicts(self):
-      docs = self.docs
-      docDicts = {}
-      for key in docs.keys():
-         sent = docs[key]
-         wLists = sent.split(" ")
-         docDicts[key] = wLists
-      return docDicts
-
-   def square_rooted(self,x):
-      return round(math.sqrt(sum([a*a for a in x])),3)
-
-   def cosine_similarity(self,x,y):
-      numerator = sum(a*b for a,b in zip(x,y))
-      denominator = self.square_rooted(x)*self.square_rooted(y)
-      return round(numerator/float(denominator),3)
-
-   def generateNegatives(self):
-      docs = self.docs
-      docNames = docs.keys()
-      docLists = self.sentenceToWordLists()
-      docDicts = self.sentenceToWordDicts()
-      docLabels = []
-      for key in docNames:
-        ar = key.split("/")
-        docLabels.append(ar[1])
-      sentences = LabeledLineSentence(docLists,docLabels)
-      model = Doc2Vec(min_count=1, window=10, size=2000, sample=1e-4, negative=5, workers=8)
-
-      model.build_vocab(sentences.to_array())
-      token_count = sum([len(sentence) for sentence in sentences])
-      for epoch in range(10):
-          model.train(sentences.sentences_perm(),total_examples = token_count,epochs=model.iter)
-          model.alpha -= 0.002 # decrease the learning rate
-          model.min_alpha = model.alpha # fix the learning rate, no deca
-          model.train(sentences.sentences_perm(),total_examples = token_count,epochs=model.iter)
-
-      degreeMap = {}
-      for i , item1 in enumerate(docLabels):
-         fDoc = model.docvecs[docLabels[i]]
-         cInstMap = {}
-         cInstance = docNames[i]
-         for j,item2 in enumerate(docLabels):
-
-            tDoc = model.docvecs[docLabels[j]]
-            cosineVal = max(-1.0,min(self.cosine_similarity(fDoc,tDoc),1.0))
-
-
-            try:
-            	cValue = math.degrees(math.acos(cosineVal))
-            except:
-                print("ERROR: invalid cosine value")
-                print cosineVal
-                print fDoc
-                print tDoc
-                exit()
-            tInstance = docNames[j]
-            cInstMap[tInstance] = cValue
-         degreeMap[cInstance] = cInstMap
-      negInstances = {}
-      for k in np.sort(degreeMap.keys()):
-        v = degreeMap[k]
-        ss = sorted(v.items(), key=lambda x: x[1])
-        sentAngles = ""
-        for item in ss:
-          if item[0] != k:
-             sentAngles += item[0]+"-"+str(item[1])+","
-        sentAngles = sentAngles[:-1]
-        negInstances[k] = sentAngles
-
-	  # pickle negative examples for later use
-
-      with open('NegExamples_'+ resultDir + '.pickle', 'wb') as handle:
-		     pickle.dump(negInstances, handle, protocol=pickle.HIGHEST_PROTOCOL)
-
-      return negInstances
-
-############Negative Example Generation --- END ########
+
 
 class Category:
    """ Class to bundle our dataset functions and variables category wise. """
@@ -610,6 +490,8 @@ def getDataSet(self,cDf,nDf,tests,fName):
             negExamples = pickle.load(handle)
       else:
          negExamples = negSelection.generateNegatives()
+         with open('NegExamples_RECENT.pickle', 'wb') as handle:
+         	pickle.dump(negExamples, handle, protocol=pickle.HIGHEST_PROTOCOL)
 
       """ find negative instances for all tokens.
       """