diff --git a/equivalent_ra.py b/equivalent_ra.py new file mode 100644 index 00000000..438e2d34 --- /dev/null +++ b/equivalent_ra.py @@ -0,0 +1,136 @@ +def ra_eq1(query): + ''' + σθ1∧θ2(E) = σθ1(σθ2(E)) + ''' + #Έλεγχος για το αν υπάρχει condition στο where και αν υπαρχει και and + if query['where'] is not None and query['where'].find('and') != -1: + #Split το condition σε 2 μέρη και ανακατασκευή. + condition1, condition2 = query['where'].split('and') + equiv_query = query.copy() + equiv_query['where'] = condition1 + equiv_query['from'] = {'select': '*', 'from': equiv_query['from'], 'where': condition2,'distinct': None, 'orderby': None,'limit': None,'desc': None} + return equiv_query + return None + +def ra_eq2(query): + ''' + E1 inner join θ E2 ---> E2 inner join θ + ''' + if 'from' in query and 'join' in query['from']: + left = query['from']['left'] + right = query['from']['right'] + equiv_query = query.copy() + equiv_query['from']['left'] = right + equiv_query['from']['right'] = left + return equiv_query + else: + return query + +def ra_eq3(query): + ''' + select θ (Ε1 Inner Join θ1 Ε2) ----> (select θ Ε1) Inner Join θ1 Ε2 + ''' + if 'from' in query and 'join' in query['from'] and query['from']['join'] == 'inner': + left_table = query['from']['left'] + right_table = query['from']['right'] + on_condition = query['from']['on'] + where_condition = query['where'] + + #Ελεγχος για το table στο οποίο ανήκει το condition (π.χ classroom.capacity το condition ανήκει στο table classroom) + if where_condition and where_condition.startswith(f"{left_table}."): + nested_left = { + 'select': query['select'], + 'from': left_table, + 'where': where_condition + } + nested_right = right_table + elif where_condition and where_condition.startswith(f"{right_table}."): + nested_left = left_table + nested_right = { + 'select': query['select'], + 'from': right_table, + 'where': where_condition + } + else: + nested_left = left_table + nested_right = right_table + + transformed_query = { + 
'select': query['select'], + 'from': { + 'join': 'inner', + 'left': nested_left, + 'right': nested_right, + 'on': on_condition + }, + 'where': None, # Αφου το where condition έχει ήδη γίνει apply πιο πάνω. + 'distinct': query['distinct'], + 'order by': query['order by'], + 'limit': query['limit'] + } + print("OEOOOO") + print(transformed_query) + return transformed_query + else: + return query + +def ra_eq4(query): + ''' + Given a query, apply the transformation rule + select θ1 AND θ2 (Ε1 inner join Ε2) -> (select θ1 Ε1) INNER JOIN (select θ2 Ε2) + ''' + if 'from' in query and 'join' in query['from'] and query['from']['join'] == 'inner': + left_table = query['from']['left'] + right_table = query['from']['right'] + on_condition = query['from']['on'] + where_condition = query['where'] + + theta1, theta2 = where_condition.split('and') + + transformed_query_left = { + 'select': query['select'], + 'from': left_table, + 'where': "", + 'distinct': query['distinct'], + 'order by': query['order by'], + 'limit': query['limit'] + } + + transformed_query_right = { + 'select': query['select'], + 'from': right_table, + 'where': "", + 'distinct': query['distinct'], + 'order by': query['order by'], + 'limit': query['limit'] + } + + #Λοοπ μέσα από όλα τα conditions και κάνει assign το κάθε where condition στο αντίστοιχο table + for condition in (theta1, theta2): + condition = condition.strip() + if left_table in condition: + transformed_query_left['where'] += f"{condition}" + elif right_table in condition: + transformed_query_right['where'] += f"{condition}" + + transformed_query_left['where'] = transformed_query_left['where'].rstrip('and') + transformed_query_right['where'] = transformed_query_right['where'].rstrip('and') + + + final_transformed_query = { + 'select': '*', + 'from': { + 'join': 'inner', + 'left': transformed_query_left, + 'right': transformed_query_right, + 'on': on_condition + }, + 'where': None, + 'distinct': query['distinct'], + 'order by': query['order by'], + 
'limit': query['limit'] + } + + return final_transformed_query + + return query \ No newline at end of file diff --git a/mdb.py b/mdb.py index a981e5be..69b32de6 100644 --- a/mdb.py +++ b/mdb.py @@ -2,11 +2,16 @@ import re from pprint import pprint import sys -import readline +#import pyreadline import traceback import shutil sys.path.append('miniDB') +from equivalent_ra import ra_eq1 +from equivalent_ra import ra_eq2 +from equivalent_ra import ra_eq3 +from equivalent_ra import ra_eq4 + from database import Database from table import Table # art font is "big" @@ -89,23 +94,44 @@ def create_query_plan(query, keywords, action): else: dic['desc'] = False dic['order by'] = dic['order by'].removesuffix(' asc').removesuffix(' desc') - else: dic['desc'] = None - if action=='create table': args = dic['create table'][dic['create table'].index('('):dic['create table'].index(')')+1] dic['create table'] = dic['create table'].removesuffix(args).strip() + arg_nopk = args.replace('primary key', '')[1:-1] + arglist = [val.strip().split(' ') for val in arg_nopk.split(',')] + dic['column_names'] = ','.join([val[0] for val in arglist]) + dic['column_types'] = ','.join([val[1] for val in arglist]) if 'primary key' in args: - arglist = args[1:-1].split(' ') - dic['primary key'] = arglist[arglist.index('primary')-2] + arglist_has_pkey = args[1:-1].split(' ') + + dic['primary key'] = arglist_has_pkey[arglist_has_pkey.index('primary')-2] else: dic['primary key'] = None + + + unique_columns=[] + + + + for col in arglist: + if 'unique' in col: + + unique_columns.append(col[0]) + + if len(unique_columns)!=0: + dic['unique'] = ','.join(unique_columns) + else: + dic['unique']=None + + + if action=='import': dic = {'import table' if key=='import' else key: val for key, val in dic.items()} @@ -120,6 +146,23 @@ def create_query_plan(query, keywords, action): dic['force'] = True else: dic['force'] = False + + if action=='create index': + + args = dic['on'].split(' ') + dic['on'] = 
dic['on'].split(' ')[0] + if len(args) > 1: + + dic['column'] = args[2] + else: + dic['column'] = 'pkey' + args = dic['using'].split(' ') + dic['using'] = dic['using'].split(' ')[0] + if len(args) > 1: + dic['index_type'] = args[1] + + + return dic @@ -157,7 +200,8 @@ def evaluate_from_clause(dic): join_dic['right'] = interpret(join_dic['right'][1:-1].strip()) dic['from'] = join_dic - + print(ra_eq4(dic)) + print(dic) return dic def interpret(query): @@ -200,6 +244,7 @@ def execute_dic(dic): dic[key] = execute_dic(dic[key]) action = list(dic.keys())[0].replace(' ','_') + return getattr(db, action)(*dic.values()) def interpret_meta(command): @@ -251,7 +296,7 @@ def remove_db(db_name): dbname = os.getenv('DB') db = Database(dbname, load=True) - + if fname is not None: @@ -262,6 +307,7 @@ def remove_db(db_name): pprint(dic, sort_dicts=False) else : dic = interpret(line.lower()) + result = execute_dic(dic) if isinstance(result,Table): result.show() diff --git a/miniDB/btree.py b/miniDB/btree.py index f0676209..6b3a6fdc 100644 --- a/miniDB/btree.py +++ b/miniDB/btree.py @@ -116,6 +116,7 @@ def insert(self, value, ptr, rptr=None): # insert to it self.nodes[index].insert(value,ptr) # if the node has more elements than b-1, split the node + if len(self.nodes[index].values)==self.b: self.split(index) @@ -309,6 +310,7 @@ def find(self, operator, value): if operator == '>': for idx, node_value in enumerate(target_node.values): ops+=1 + if node_value > value: results.append(target_node.ptrs[idx]) while target_node.right_sibling is not None: diff --git a/miniDB/database.py b/miniDB/database.py index a3ac6be7..cc26aa63 100644 --- a/miniDB/database.py +++ b/miniDB/database.py @@ -4,13 +4,15 @@ import os,sys import logging import warnings -import readline +#import pyreadline +import re from tabulate import tabulate sys.path.append(f'{os.path.dirname(os.path.dirname(os.path.abspath(__file__)))}/miniDB') from miniDB import table sys.modules['table'] = table +from hashidx import 
ExtendibleHashing as hash from joins import Inlj, Smj from btree import Btree from misc import split_condition @@ -54,7 +56,7 @@ def __init__(self, name, load=True, verbose = True): self.create_table('meta_length', 'table_name,no_of_rows', 'str,int') self.create_table('meta_locks', 'table_name,pid,mode', 'str,int,str') self.create_table('meta_insert_stack', 'table_name,indexes', 'str,list') - self.create_table('meta_indexes', 'table_name,index_name', 'str,str') + self.create_table('meta_indexes', 'table_name,index_name,index_type,column', 'str,str,str,str') self.save_database() def save_database(self): @@ -101,7 +103,7 @@ def _update(self): self._update_meta_insert_stack() - def create_table(self, name, column_names, column_types, primary_key=None, load=None): + def create_table(self, name, column_names, column_types, primary_key=None, unique=None,load=None): ''' This method create a new table. This table is saved and can be accessed via db_object.tables['table_name'] or db_object.table_name @@ -113,7 +115,7 @@ def create_table(self, name, column_names, column_types, primary_key=None, load= load: boolean. Defines table object parameters as the name of the table and the column names. 
''' # print('here -> ', column_names.split(',')) - self.tables.update({name: Table(name=name, column_names=column_names.split(','), column_types=column_types.split(','), primary_key=primary_key, load=load)}) + self.tables.update({name: Table(name=name, column_names=column_names.split(','), column_types=column_types.split(','), primary_key=primary_key,unique=unique, load=load)}) # self._name = Table(name=name, column_names=column_names, column_types=column_types, load=load) # check that new dynamic var doesnt exist already # self.no_of_tables += 1 @@ -160,7 +162,7 @@ def drop_table(self, table_name): self.save_database() - def import_table(self, table_name, filename, column_types=None, primary_key=None): + def import_table(self, table_name, filename, column_types=None, primary_key=None, unique=None): ''' Creates table from CSV file. @@ -177,7 +179,7 @@ def import_table(self, table_name, filename, column_types=None, primary_key=None colnames = line.strip('\n') if column_types is None: column_types = ",".join(['str' for _ in colnames.split(',')]) - self.create_table(name=table_name, column_names=colnames, column_types=column_types, primary_key=primary_key) + self.create_table(name=table_name, column_names=colnames, column_types=column_types, primary_key=primary_key,unique=unique) lock_ownership = self.lock_table(table_name, mode='x') first_line = False continue @@ -267,6 +269,7 @@ def insert_into(self, table_name, row_str): self.load_database() # fetch the insert_stack. 
For more info on the insert_stack # check the insert_stack meta table + lock_ownership = self.lock_table(table_name, mode='x') insert_stack = self._get_insert_stack_for_table(table_name) try: @@ -274,6 +277,7 @@ def insert_into(self, table_name, row_str): except Exception as e: logging.info(e) logging.info('ABORTED') + self._update_meta_insert_stack_for_tb(table_name, insert_stack[:-1]) if lock_ownership: @@ -356,21 +360,102 @@ def select(self, columns, table_name, condition, distinct=None, order_by=None, \ self.load_database() if isinstance(table_name,Table): return table_name._select_where(columns, condition, distinct, order_by, desc, limit) - + + condition_column="" + op="" if condition is not None: - condition_column = split_condition(condition)[0] - else: - condition_column = '' + if "between" in condition.split(): + condition_column = condition.split()[0] + elif "and" in condition.split() : + columns2=[] + conditions=[] + conditions=condition.split(" and ") + for con in conditions: + col=self.tables[table_name]._parse_condition(con) + columns2.append(col) + for con in columns2: + if (self.tables[table_name].pk_idx is not None and con[0] == self.tables[table_name].column_names[self.tables[table_name].pk_idx]) or (self.tables[table_name].unique is not None and con[0] in self.tables[table_name].unique ):#since only pk supports index , and only one pk per table we keep the column if it is pk + condition_column = con[0] + else: + condition_column="" + + elif "or" in condition.split(): + + columns2=[] + conditions=[] + conditions=condition.split(" or ") + for con in conditions: + + col=self.tables[table_name]._parse_condition(con) + + columns2.append(col) + + for con in columns2: + + + if self.tables[table_name].pk_idx is not None and con[0] == self.tables[table_name].column_names[self.tables[table_name].pk_idx]: + condition_column = con[0] + else: + condition_column = "" + else: + + col,op,_= self.tables[table_name]._parse_condition(condition) + 
condition_column=col + + + + + # self.lock_table(table_name, mode='x') if self.is_locked(table_name): return - if self._has_index(table_name) and condition_column==self.tables[table_name].column_names[self.tables[table_name].pk_idx]: - index_name = self.select('*', 'meta_indexes', f'table_name={table_name}', return_object=True).column_by_name('index_name')[0] - bt = self._load_idx(index_name) - table = self.tables[table_name]._select_where_with_btree(columns, bt, condition, distinct, order_by, desc, limit) + + + if self.tables[table_name].pk_idx is not None and self.tables[table_name].pk == condition_column and self._has_index(table_name,condition_column): + + index_name = self.select('*', 'meta_indexes', f'table_name={table_name}',return_object=True).column_by_name('index_name')[0] + index_type = self.select('*', 'meta_indexes', f'table_name={table_name}',return_object=True).column_by_name('index_type')[0] + + if index_type=="btree": + + bt = self._load_idx(index_name) + table = self.tables[table_name]._select_where_with_btree(columns, bt, condition, distinct, order_by, desc, limit) + elif index_type=="hash": + print("hash") + if op== "=": + bt = self._load_idx(index_name) + table = self.tables[table_name]._select_where_with_hash(columns, bt, condition, distinct, order_by, desc, limit) + else: + table = self.tables[table_name]._select_where(columns, condition, distinct, order_by, desc, limit) + elif self.tables[table_name].unique_idx is not None and self._has_index(table_name,condition_column): + + found = False + for j in range (len(self.tables[table_name].unique)): + + if self.tables[table_name].unique[j] in condition_column: + found = True + index_name = self.select('*', 'meta_indexes', f'table_name={table_name}',return_object=True).column_by_name('index_name')[0] + index_type = self.select('*', 'meta_indexes', f'table_name={table_name}',return_object=True).column_by_name('index_type')[0] + + if index_type=="btree": + + bt = self._load_idx(index_name) + table = 
self.tables[table_name]._select_where_with_btree(columns, bt, condition, distinct, order_by, desc, limit) + elif index_type=="hash": + print("hash") + if op== "=": + bt = self._load_idx(index_name) + table = self.tables[table_name]._select_where_with_hash(columns, bt, condition, distinct, order_by, desc, limit) + else: + table = self.tables[table_name]._select_where(columns, condition, distinct, order_by, desc, limit) + break + if not found: + + table = self.tables[table_name]._select_where(columns, condition, distinct, order_by, desc, limit) else: + table = self.tables[table_name]._select_where(columns, condition, distinct, order_by, desc, limit) # self.unlock_table(table_name) if save_as is not None: @@ -581,7 +666,9 @@ def _update_meta_length(self): ''' Updates the meta_length table. ''' + for table in self.tables.values(): + if table._name[:4]=='meta': #skip meta tables continue if table._name not in self.tables['meta_length'].column_by_name('table_name'): # if new table, add record with 0 no. of rows @@ -650,7 +737,7 @@ def _update_meta_insert_stack_for_tb(self, table_name, new_stack): # indexes - def create_index(self, index_name, table_name, index_type='btree'): + def create_index(self, index_name, table_name,index_type='btree' ,column='pkey'): ''' Creates an index on a specified table with a given name. Important: An index can only be created on a primary key (the user does not specify the column). @@ -659,21 +746,38 @@ def create_index(self, index_name, table_name, index_type='btree'): table_name: string. Table name (must be part of database). index_name: string. Name of the created index. ''' - if self.tables[table_name].pk_idx is None: # if no primary key, no index - raise Exception('Cannot create index. 
Table has no primary key.') + #if the user didn't specify a column, make the index on primary key + if self.tables[table_name].pk_idx is not None and column=='pkey': + column = self.tables[table_name].pk + + + if self.tables[table_name].pk_idx is None and self.tables[table_name].unique is None: # if no primary key, no index + raise Exception('Cannot create index. Table has no primary key or unique columns.') + if index_name not in self.tables['meta_indexes'].column_by_name('index_name'): # currently only btree is supported. This can be changed by adding another if. if index_type=='btree': logging.info('Creating Btree index.') + print('Creating BTREE index') # insert a record with the name of the index and the table on which it's created to the meta_indexes table - self.tables['meta_indexes']._insert([table_name, index_name]) + self.tables['meta_indexes']._insert([table_name, index_name,index_type,column]) # crate the actual index - self._construct_index(table_name, index_name) + self._construct_index(table_name, index_name,column,index_type) + #self._construct_index(table_name, index_name) + self.save_database() + if index_type=='hash': + logging.info('Creating Hash index.') + print('Creating HASH index') + # insert a record with the name of the index and the table on which it's created to the meta_indexes table + self.tables['meta_indexes']._insert([table_name, index_name,index_type,column]) + # crate the actual index + self._construct_index(table_name, index_name,column,index_type) + #self._construct_index(table_name, index_name) self.save_database() else: raise Exception('Cannot create index. Another index with the same name already exists.') - def _construct_index(self, table_name, index_name): + def _construct_index(self, table_name, index_name,column,index_type='btree'): ''' Construct a btree on a table and save. @@ -681,25 +785,61 @@ def _construct_index(self, table_name, index_name): table_name: string. Table name (must be part of database). 
index_name: string. Name of the created index. ''' - bt = Btree(3) # 3 is arbitrary - - # for each record in the primary key of the table, insert its value and index to the btree - for idx, key in enumerate(self.tables[table_name].column_by_name(self.tables[table_name].pk)): - if key is None: - continue - bt.insert(key, idx) - # save the btree - self._save_index(index_name, bt) - - - def _has_index(self, table_name): + if index_type=='btree': + bt = Btree(1) # 3 is arbitrary + + # for each record in the primary key of the table, insert its value and index to the btree + if self.tables[table_name].pk is not None and column==self.tables[table_name].pk: + for idx, key in enumerate(self.tables[table_name].column_by_name(column)): + if key is None: + continue + bt.insert(key, idx) + + elif self.tables[table_name].unique is not None and column in self.tables[table_name].unique: + + for idx, key in enumerate(self.tables[table_name].column_by_name(column)): + + if key is None: + continue + bt.insert(key, idx) + + else: + raise ValueError(f'##ERROR->{column} is not primary key or unique') + # save the btree + + self._save_index(index_name, bt) + elif index_type=='hash': + hi = hash(1) + if self.tables[table_name].pk is not None and column==self.tables[table_name].pk: + for idx, key in enumerate(self.tables[table_name].column_by_name(column)): + if key is None: + + continue + + hi.insert(idx, key)#idx will be used as the hashing key, which is the order that the values are in the table + #key is the value that will be inserted + elif self.tables[table_name].unique is not None and column in self.tables[table_name].unique: + + for idx, key in enumerate(self.tables[table_name].column_by_name(column)): + + if key is None: + continue + hi.insert(idx,key) + self._save_index(index_name, hi) + hi.show() + def _has_index(self, table_name, column): ''' Check whether the specified table's primary key column is indexed. Args: - table_name: string. Table name (must be part of database). 
+ table_name: string. Table name (must be part of the database). + column: string. Column name to check for indexing. ''' - return table_name in self.tables['meta_indexes'].column_by_name('table_name') + if table_name in self.tables['meta_indexes'].column_by_name('table_name') and column in self.tables['meta_indexes'].column_by_name('column'): + return True + else: + return False + def _save_index(self, index_name, index): ''' diff --git a/miniDB/hashidx.py b/miniDB/hashidx.py new file mode 100644 index 00000000..49d71f04 --- /dev/null +++ b/miniDB/hashidx.py @@ -0,0 +1,87 @@ +class ExtendibleHashing: + def __init__(self, global_depth): + self.global_depth = global_depth + self.directory = {} + self.bucket_size = 4 # Number of elements in each bucket + + def hash_function(self, key): + + hash_value = 0 + for char in key: + hash_value = (hash_value * 31 + ord(char)) % (2 ** self.global_depth) + return hash_value + + def insert(self, key, value): + hashed_key = self.hash_function(value) + if hashed_key in self.directory: + bucket = self.directory[hashed_key] + for i, (k, v) in enumerate(bucket): + if k == value: + bucket[i] = (k, value) # Update value for existing key + return + if len(bucket) < self.bucket_size: + bucket.append((key, value)) + else: + if self.global_depth == len(bin(hashed_key)) - 2: + self.double_directory() + self.split_bucket(hashed_key) + self.insert(key, value) + else: + if self.global_depth == len(bin(hashed_key)) - 2: + self.double_directory() + self.directory[hashed_key] = [(key, value)] + + + def double_directory(self): + self.global_depth += 1 + directory_size = 2 ** (self.global_depth - 1) + for i in range(directory_size): + if i in self.directory: + self.directory[i + directory_size] = self.directory[i] + else: + self.directory[i + directory_size] = [] + + def split_bucket(self, hashed_key): + bucket = self.directory[hashed_key] + new_bucket = [] + split_index = len(bucket) // 2 + + # Split the bucket into two by creating a new bucket 
and updating the directory + new_bucket = bucket[split_index:] + bucket = bucket[:split_index] + self.directory[hashed_key] = bucket + + # Update the hashed keys of the new bucket and its duplicates + new_hashed_key = hashed_key + (2 ** (self.global_depth - 1)) + for key in range(new_hashed_key, new_hashed_key + (2 ** (self.global_depth - 1)), 2 ** (self.global_depth - 1)): + self.directory[key] = new_bucket.copy() + + + + + def find(self, key): + self.global_depth=1 + hashed_key = self.hash_function(key) + if hashed_key in self.directory: + bucket = self.directory[hashed_key] + for item in bucket: + if item[1] == key: + return item[0] + return None + + def delete(self, key): + hashed_key = self.hash_function(key) + if hashed_key in self.directory: + bucket = self.directory[hashed_key] + for i, item in enumerate(bucket): + if item[0] == key: + del bucket[i] + return True + return False + + def show(self): + for hashed_key, bucket in self.directory.items(): + print(f"Hashed Key: {hashed_key}") + for item in bucket: + print(f" Key: {item[0]}, Value: {item[1]}") + diff --git a/miniDB/misc.py b/miniDB/misc.py index aefada74..e3b6ccfa 100644 --- a/miniDB/misc.py +++ b/miniDB/misc.py @@ -1,4 +1,5 @@ import operator +import re def get_op(op, a, b): ''' @@ -8,24 +9,38 @@ def get_op(op, a, b): '<': operator.lt, '>=': operator.ge, '<=': operator.le, - '=': operator.eq} + '=': operator.eq, + '!=': operator.ne} try: return ops[op](a,b) except TypeError: # if a or b is None (deleted record), python3 raises typerror return False -def split_condition(condition): - ops = {'>=': operator.ge, - '<=': operator.le, - '=': operator.eq, - '>': operator.gt, - '<': operator.lt} + + + +def split_condition(condition,negate=0): + + if condition.startswith("not "): + negate=-1 + condition = condition[4:].strip() + + + ops = {'>=': operator.ge, + '<=': operator.le, + '=': operator.eq, + '>': operator.gt, + '<': operator.lt, + '!=': operator.ne} + for op_key in ops.keys(): + 
splt=condition.split(op_key) if len(splt)>1: left, right = splt[0].strip(), splt[1].strip() + if right[0] == '"' == right[-1]: # If the value has leading and trailing quotes, remove them. right = right.strip('"') @@ -34,7 +49,20 @@ def split_condition(condition): if right.find('"') != -1: # If there are any double quotes in the value, throw. (Notice we've already removed the leading and trailing ones) raise ValueError(f'Invalid condition: {condition}\nDouble quotation marks are not allowed inside values.') - + if negate==-1: + if op_key == '<=': + op_key = '>' + elif op_key == '>=': + op_key= '<' + elif op_key == '=': + op_key ='!=' + elif op_key=='>': + op_key = '<=' + elif op_key=='<': + op_key='>=' + elif op_key=='!=': + op_key='=' + return left, op_key, right def reverse_op(op): @@ -46,5 +74,6 @@ def reverse_op(op): '>=' : '<=', '<' : '>', '<=' : '>=', - '=' : '=' + '=' : '=', + '!=': '!=' }.get(op) diff --git a/miniDB/table.py b/miniDB/table.py index f5c7d937..4ad5c75e 100644 --- a/miniDB/table.py +++ b/miniDB/table.py @@ -3,10 +3,11 @@ import pickle import os import sys +import re sys.path.append(f'{os.path.dirname(os.path.dirname(os.path.abspath(__file__)))}/miniDB') -from misc import get_op, split_condition +from misc import get_op, split_condition, reverse_op class Table: @@ -26,9 +27,10 @@ class Table: - a dictionary that includes the appropriate info (all the attributes in __init__) ''' - def __init__(self, name=None, column_names=None, column_types=None, primary_key=None, load=None): + def __init__(self, name=None, column_names=None, column_types=None, primary_key=None,unique=None, load=None): if load is not None: + # if load is a dict, replace the object dict with it (replaces the object with the specified one) if isinstance(load, dict): self.__dict__.update(load) @@ -36,6 +38,7 @@ def __init__(self, name=None, column_names=None, column_types=None, primary_key= # if load is str, load from a file elif isinstance(load, str): self._load_from_file(load) + 
# if name, columns_names and column types are not none elif (name is not None) and (column_names is not None) and (column_types is not None): @@ -48,7 +51,7 @@ def __init__(self, name=None, column_names=None, column_types=None, primary_key= self.column_names = column_names self.columns = [] - + for col in self.column_names: if col not in self.__dir__(): # this is used in order to be able to call a column using its name as an attribute. @@ -64,10 +67,25 @@ def __init__(self, name=None, column_names=None, column_types=None, primary_key= # if primary key is set, keep its index as an attribute if primary_key is not None: self.pk_idx = self.column_names.index(primary_key) + else: self.pk_idx = None self.pk = primary_key + + self.unique_idx = [] + + if unique is not None: + self.unique = unique.split(',') + unique=unique.split(',') + for unq in unique: + if unq in self.column_names: + self.unique_idx.append(self.column_names.index(unq)) + else: + self.unique_idx=None + self.unique=None + + # self._update() # if any of the name, columns_names and column types are none. return an empty table object @@ -110,9 +128,10 @@ def _insert(self, row, insert_stack=[]): row: list. A list of values to be inserted (will be casted to a predifined type automatically). insert_stack: list. The insert stack (empty by default). ''' + if len(row)!=len(self.column_names): raise ValueError(f'ERROR -> Cannot insert {len(row)} values. Only {len(self.column_names)} columns exist') - + for i in range(len(row)): # for each value, cast and replace it in row. 
try: @@ -123,13 +142,22 @@ def _insert(self, row, insert_stack=[]): except TypeError as exc: if row[i] != None: print(exc) - + # if value is to be appended to the primary_key column, check that it doesnt alrady exist (no duplicate primary keys) if i==self.pk_idx and row[i] in self.column_by_name(self.pk): raise ValueError(f'## ERROR -> Value {row[i]} already exists in primary key column.') + elif i==self.pk_idx and row[i] is None: raise ValueError(f'ERROR -> The value of the primary key cannot be None.') - + + + if self.unique_idx is not None: + if i in self.unique_idx: + for j in range (len(self.unique)): + if row[i] in self.column_by_name(self.unique[j]): + raise ValueError(f'## ERROR -> Value {row[i]} already exists in unique column.') + + # if insert_stack is not empty, append to its last index if insert_stack != []: self.data[insert_stack[-1]] = row @@ -150,19 +178,113 @@ def _update_rows(self, set_value, set_column, condition): Operatores supported: (<,<=,=,>=,>) ''' + if condition is not None: + condition = self.replace_between(condition) + sub_conditions = condition.split(" or ") + + rows = set() + for sub_cond in sub_conditions: + is_not = False + if sub_cond.startswith("not "): + is_not = True + sub_cond = sub_cond[4:] + sub_cond=sub_cond.replace("( ", "").replace(" )", "") + + conditions = sub_cond.split(" and ") + rows_L=[] + for condition_ in conditions: + column_name, operator, value = self._parse_condition(condition_) + column = self.column_by_name(column_name) + rows_L.append([ind for ind, x in enumerate(column) if get_op(operator, x, value)]) + + and_rows = set(rows_L[0]).intersection(*rows_L) if rows_L else set() + + if is_not: + not_rows = set(range(len(column))) - and_rows + rows.update(not_rows) + else: + rows.update(and_rows) + #rows.update(self.find_rows(sub_cond)) + + + rows_to_upd = list(rows) + set_column_idx = self.column_names.index(set_column) + for idx in rows_to_upd: + self.data[idx][set_column_idx] = set_value + + ''' + if condition 
is not None: + + if re.match(r"^\w+\s*(=|<=|>=|<|>|!=)\s*\w+$", condition) or condition.startswith("not "):#simple condition or not + column_name, operator, value = self._parse_condition(condition) + column = self.column_by_name(column_name) + set_column_idx = self.column_names.index(set_column) + for row_ind, column_value in enumerate(column): + if get_op(operator, column_value, value): + self.data[row_ind][set_column_idx] = set_value + elif re.match(r"^\w+\s+between\s+\w+\s+and\s+\w+$", condition): + + query = condition.split() + index = query.index("between") + megalutero = query[index+1] + mikrotero = query[index+3] + column_name = query[index-1] + column = self.column_by_name(column_name) + set_column_idx = self.column_names.index(set_column) + + for i,j in enumerate(column): + if j >= megalutero and j <= mikrotero: + self.data[i][set_column_idx] = set_value + elif re.match(r"^\w+\s*(=|<=|>=|<|>|!=)\s*\w+\s+or\s+\w+\s*(=|<=|>=|<|>|!=)\s*\w+$", condition): + + + conditions = condition.split(" or ") + set_column_idx = self.column_names.index(set_column) + for condition_ in conditions: + column_name, operator, value = self._parse_condition(condition_) + column = self.column_by_name(column_name) + + for row_ind, column_value in enumerate(column): + if get_op(operator, column_value, value): + self.data[row_ind][set_column_idx] = set_value + + + elif re.match(r"^\w+\s*(=|<=|>=|<|>|!=)\s*\w+\s+and\s+\w+\s*(=|<=|>=|<|>|!=)\s*\w+$", condition): + column_name = condition.split()[0] + conditions = condition.split(" and ") + set_column_idx = self.column_names.index(set_column) + rows_L=[] + for condition_ in conditions: + column_name, operator, value = self._parse_condition(condition_) + column = self.column_by_name(column_name) + + rows_L.append([ind for ind, x in enumerate(column) if get_op(operator, x, value)]) + + + + rows = set(rows_L[0]).intersection(*rows_L) + + for row in rows: + self.data[row][set_column_idx] = set_value + + else: + raise("invalid where 
condition") + #else: + #rows = [i for i in range(len(self.data))] + ''' # parse the condition - column_name, operator, value = self._parse_condition(condition) + ###column_name, operator, value = self._parse_condition(condition) # get the condition and the set column - column = self.column_by_name(column_name) - set_column_idx = self.column_names.index(set_column) + ### column = self.column_by_name(column_name) + ### set_column_idx = self.column_names.index(set_column) # set_columns_indx = [self.column_names.index(set_column_name) for set_column_name in set_column_names] # for each value in column, if condition, replace it with set_value - for row_ind, column_value in enumerate(column): - if get_op(operator, column_value, value): - self.data[row_ind][set_column_idx] = set_value + ### for row_ind, column_value in enumerate(column): + ### if get_op(operator, column_value, value): + #### self.data[row_ind][set_column_idx] = set_value # self._update() # print(f"Updated {len(indexes_to_del)} rows") @@ -181,15 +303,94 @@ def _delete_where(self, condition): 'value[<,<=,==,>=,>]column'. 
Operatores supported: (<,<=,==,>=,>) + ''' - column_name, operator, value = self._parse_condition(condition) - indexes_to_del = [] + if condition is not None: + condition = self.replace_between(condition) + sub_conditions = condition.split(" or ") + + rows = set() + for sub_cond in sub_conditions: + is_not = False + if sub_cond.startswith("not "): + is_not = True + sub_cond = sub_cond[4:] + sub_cond=sub_cond.replace("( ", "").replace(" )", "") + + conditions = sub_cond.split(" and ") + rows_L=[] + for condition_ in conditions: + column_name, operator, value = self._parse_condition(condition_) + column = self.column_by_name(column_name) + rows_L.append([ind for ind, x in enumerate(column) if get_op(operator, x, value)]) + + and_rows = set(rows_L[0]).intersection(*rows_L) if rows_L else set() + + if is_not: + not_rows = set(range(len(column))) - and_rows + rows.update(not_rows) + else: + rows.update(and_rows) + #rows.update(self.find_rows(sub_cond)) + + + indexes_to_del = list(rows) + ''' + if re.match(r"^\w+\s*(=|<=|>=|<|>|!=)\s*\w+$", condition) or condition.startswith("not "):#simple condition or not + column_name, operator, value = self._parse_condition(condition) + column = self.column_by_name(column_name) + for index, row_value in enumerate(column): + if get_op(operator, row_value, value): + indexes_to_del.append(index) + elif re.match(r"^\w+\s+between\s+\w+\s+and\s+\w+$", condition): + + query = condition.split() + index = query.index("between") + + column_name = query[index-1] + column = self.column_by_name(column_name) + for i in range (len(self.columns)): + if column_name==self.column_names[i]: + + megalutero = self.column_types[i](query[index+1]) + mikrotero = self.column_types[i](query[index+3]) + + for i,j in enumerate(column): + if j >= megalutero and j <= mikrotero: + indexes_to_del.append(i) + elif re.match(r"^\w+\s*(=|<=|>=|<|>|!=)\s*\w+\s+or\s+\w+\s*(=|<=|>=|<|>|!=)\s*\w+$", condition): + + column_name = condition.split()[0] + conditions = 
condition.split(" or ") + rows_L=[] + for condition_ in conditions: + + column_name, operator, value = self._parse_condition(condition_) + column = self.column_by_name(column_name) + rows_L.append([ind for ind, x in enumerate(column) if get_op(operator, x, value)]) - column = self.column_by_name(column_name) - for index, row_value in enumerate(column): - if get_op(operator, row_value, value): - indexes_to_del.append(index) + + for rlist in rows_L: + for index in rlist: + indexes_to_del.append(index) + elif re.match(r"^\w+\s*(=|<=|>=|<|>|!=)\s*\w+\s+and\s+\w+\s*(=|<=|>=|<|>|!=)\s*\w+$", condition): + column_name = condition.split()[0] + conditions = condition.split(" and ") + rows_L=[] + for condition_ in conditions: + column_name, operator, value = self._parse_condition(condition_) + column = self.column_by_name(column_name) + rows_L.append([ind for ind, x in enumerate(column) if get_op(operator, x, value)]) + + + indexes_to_del = set(rows_L[0]).intersection(*rows_L) + indexes_to_del = list(indexes_to_del) + + else: + raise("invalid where condition") + ''' + # we pop from highest to lowest index in order to avoid removing the wrong item # since we dont delete, we dont have to to pop in that order, but since delete is used @@ -233,9 +434,93 @@ def _select_where(self, return_columns, condition=None, distinct=False, order_by # if condition is None, return all rows # if not, return the rows with values where condition is met for value if condition is not None: - column_name, operator, value = self._parse_condition(condition) - column = self.column_by_name(column_name) - rows = [ind for ind, x in enumerate(column) if get_op(operator, x, value)] + condition = self.replace_between(condition) + sub_conditions = condition.split(" or ") + + rows = set() + for sub_cond in sub_conditions: + is_not = False + if sub_cond.startswith("not "): + is_not = True + sub_cond = sub_cond[4:] + sub_cond=sub_cond.replace("( ", "").replace(" )", "") + + conditions = sub_cond.split(" and ") + 
rows_L=[] + for condition_ in conditions: + column_name, operator, value = self._parse_condition(condition_) + column = self.column_by_name(column_name) + rows_L.append([ind for ind, x in enumerate(column) if get_op(operator, x, value)]) + + and_rows = set(rows_L[0]).intersection(*rows_L) if rows_L else set() + + if is_not: + not_rows = set(range(len(column))) - and_rows + rows.update(not_rows) + else: + rows.update(and_rows) + #rows.update(self.find_rows(sub_cond)) + + + rows = list(rows) + ''' + if re.match(r"^\w+\s*(=|<=|>=|<|>|!=)\s*\w+$", condition) or condition.startswith("not "):#simple condition or not + column_name, operator, value = self._parse_condition(condition) + column = self.column_by_name(column_name) + rows = [ind for ind, x in enumerate(column) if get_op(operator, x, value)] + elif re.match(r"^\w+\s+between\s+\w+\s+and\s+\w+$", condition): + + query = condition.split() + index = query.index("between") + + #megalutero = query[index+1] + #mikrotero = query[index+3] + column_name = query[index-1] + + + + column = self.column_by_name(column_name) + for i in range (len(self.columns)): + if column_name==self.column_names[i]: + + megalutero = self.column_types[i](query[index+1]) + mikrotero = self.column_types[i](query[index+3]) + + rows = [] + for i,j in enumerate(column): + if j >= megalutero and j <= mikrotero: + rows.append(i) + elif re.match(r"^\w+\s*(=|<=|>=|<|>|!=)\s*\w+\s+or\s+\w+\s*(=|<=|>=|<|>|!=)\s*\w+$", condition): + + column_name = condition.split()[0] + conditions = condition.split(" or ") + rows_L=[] + for condition_ in conditions: + + column_name, operator, value = self._parse_condition(condition_) + column = self.column_by_name(column_name) + rows_L.append([ind for ind, x in enumerate(column) if get_op(operator, x, value)]) + + rows=[] + for rlist in rows_L: + for row in rlist: + rows.append(row) + elif re.match(r"^\w+\s*(=|<=|>=|<|>|!=)\s*\w+\s+and\s+\w+\s*(=|<=|>=|<|>|!=)\s*\w+$", condition): + column_name = condition.split()[0] + 
def replace_between(self, condition):
    '''
    Rewrite every 'column between low and high' clause of a condition as the
    equivalent pair of comparisons 'column>=low and column<=high'.

    Args:
        condition: string. A where-condition whose tokens are separated by whitespace.

    Returns:
        The condition exactly as given when it holds no 'between' token; otherwise
        the rewritten condition with its tokens joined by single spaces.

    Raises:
        ValueError: when a 'between' token has no column name in front of it or
            fewer than three tokens (low bound, connective, high bound) after it.
    '''
    tokens = condition.split()
    if "between" not in tokens:
        # nothing to rewrite: hand the caller's string back untouched
        return condition

    # Each pass removes one 'between' token, so the loop always terminates.
    while "between" in tokens:
        index = tokens.index("between")
        if index == 0 or index + 3 >= len(tokens):
            # Without this guard a leading 'between' would read tokens[-1] as the
            # column name and a truncated clause would surface as a bare IndexError.
            raise ValueError(f'Condition is not valid (malformed between clause): {condition}')

        column_name = tokens[index - 1]
        low = tokens[index + 1]
        high = tokens[index + 3]
        # tokens[index + 2] is the connective between the two bounds; it is dropped
        # and replaced by the explicit 'and' of the expanded form.
        tokens[index - 1:index + 4] = [f"{column_name}>={low}", "and", f"{column_name}<={high}"]

    return " ".join(tokens)
+ + rows = set() + for sub_cond in sub_conditions: + is_not = False + if sub_cond.startswith("not "): + is_not = True + sub_cond = sub_cond[4:] + sub_cond=sub_cond.replace("( ", "").replace(" )", "") + + conditions = sub_cond.split(" and ") + rows_L=[] + for condition_ in conditions: + column_name, operator, value = self._parse_condition(condition_) + column = self.column_by_name(column_name) + rows_L.append(bt.find(operator, value)) + + + + and_rows = set(rows_L[0]).intersection(*rows_L) if rows_L else set() + + if is_not: + not_rows = set(range(len(column))) - and_rows + rows.update(not_rows) + else: + rows.update(and_rows) + + + + rows = list(rows) + + + '''if re.match(r"^\w+\s*(=|<=|>=|<|>|!=)\s*\w+$", condition) or condition.startswith("not "):#simple condition or not + column_name, operator, value = self._parse_condition(condition) + column = self.column_by_name(column_name) + if (self.pk_idx is not None and column_name != self.column_names[self.pk_idx]) and (self.unique is not None and column_name not in self.unique): + #print('Column is not PK. Aborting') + raise ValueError('Column is not PK or UNIQUE') + rows1 = [] + opsseq = 0 + for ind, x in enumerate(column): + opsseq+=1 + if get_op(operator, x, value): + rows1.append(ind) + rows = bt.find(operator, value) + + + elif re.match(r"^\w+\s+between\s+\w+\s+and\s+\w+$", condition): + + query = condition.split() + index = query.index("between") + + column_name = query[index-1] + if (self.pk_idx is not None and column_name != self.column_names[self.pk_idx]) and (self.unique is not None and column_name not in self.unique): + #print('Column is not PK. 
Aborting') + raise ValueError('Column is not PK or UNIQUE') + column = self.column_by_name(column_name) + for i in range (len(self.columns)): + if column_name==self.column_names[i]: + + megalutero = self.column_types[i](query[index+1]) + mikrotero = self.column_types[i](query[index+3]) + rows_greater = bt.find('>=',str(megalutero)) + rows_less = bt.find('<=',str(mikrotero)) + rows = set(rows_greater).intersection(rows_less) + rows = list(rows) + + elif re.match(r"^\w+\s*(=|<=|>=|<|>|!=)\s*\w+\s+or\s+\w+\s*(=|<=|>=|<|>|!=)\s*\w+$", condition): + + print(condition) + conditions = condition.split(" or ") + rows_L=[] + for condition_ in conditions: + + column_name, operator, value = self._parse_condition(condition_) + if (self.pk_idx is not None and column_name != self.column_names[self.pk_idx]) and (self.unique is not None and column_name not in self.unique): + #print('Column is not PK. Aborting') + raise ValueError('Column is not PK or UNIQUE') + + rows_L.append(bt.find(operator, value)) + rows=[] + print(rows_L) + for rlist in rows_L: + for row in rlist: + rows.append(row) + elif re.match(r"^\w+\s*(=|<=|>=|<|>|!=)\s*\w+\s+and\s+\w+\s*(=|<=|>=|<|>|!=)\s*\w+$", condition): + + conditions = condition.split(" and ") + rows_L=[] + for condition_ in conditions: + column_name, operator, value = self._parse_condition(condition_) + if (self.pk_idx is not None and column_name != self.column_names[self.pk_idx]) and (self.unique is not None and column_name not in self.unique): + #print('Column is not PK. Aborting') + continue + column = self.column_by_name(column_name) + rows_L.append(bt.find(operator, value)) + + rows = set(rows_L[0]).intersection(*rows_L) + rows = list(rows)''' # if the column in condition is not a primary key, abort the select - if column_name != self.column_names[self.pk_idx]: - print('Column is not PK. Aborting') + # here we run the same select twice, sequentially and using the btree. 
def _select_where_with_hash(self, return_columns, bt, condition, distinct=False, order_by=None, desc=True, limit=None):
    '''
    Select rows through a single lookup on the supplied index object and
    return them as a new table.

    Args:
        return_columns: '*' for every column, otherwise an iterable of column names.
        bt: index object; only its find(value) method is called here
            (presumably a hash index, judging by the method name - TODO confirm).
        condition: string. Parsed with _parse_condition; only the value part is looked up.
        distinct: boolean. If True, duplicate rows are removed from the result.
        order_by: column name to order the result by (no ordering when falsy).
        desc: boolean. Forwarded to order_by together with the column name.
        limit: row limit; None means no limit.
    '''
    print("select with hash")

    # resolve the requested columns to their positions in the table
    if return_columns == '*':
        return_cols = list(range(len(self.column_names)))
    else:
        return_cols = [self.column_names.index(name) for name in return_columns]

    # the column name and the operator of the condition are not used by the lookup
    _, _, lookup_value = self._parse_condition(condition)
    matches = [bt.find(lookup_value)]

    try:
        k = int(limit)
    except TypeError:
        # int(None) raises TypeError: no limit was given
        k = None

    # same as simple select from now on
    matches = list(matches[:k])
    print(matches)

    # copy the table's attributes, swapping in only the selected rows and columns
    table_state = dict(self.__dict__)
    if "data" in table_state:
        table_state["data"] = [[self.data[r][c] for c in return_cols] for r in matches]
    table_state['column_names'] = [self.column_names[c] for c in return_cols]
    table_state['column_types'] = [self.column_types[c] for c in return_cols]

    s_table = Table(load=table_state)

    if distinct:
        # rows become tuples so that they can be de-duplicated through a set
        s_table.data = list(set(map(tuple, s_table.data)))

    if order_by:
        s_table.order_by(order_by, desc)

    if isinstance(limit, str):
        kept_rows = [row for row in s_table.data if row is not None]
        s_table.data = kept_rows[:int(limit)]

    return s_table