diff --git a/Orange/data/io.py b/Orange/data/io.py index 0959bb725c2..c357e65ae1c 100644 --- a/Orange/data/io.py +++ b/Orange/data/io.py @@ -14,15 +14,17 @@ from os import path, remove from tempfile import NamedTemporaryFile from urllib.parse import urlparse, urlsplit, urlunsplit, \ - unquote as urlunquote, quote + unquote as urlunquote from urllib.request import urlopen, Request from pathlib import Path import numpy as np +import pandas as pd import xlrd import xlsxwriter import openpyxl +import h5py from Orange.data import _io, Table, Domain, ContinuousVariable, update_origin from Orange.data import Compression, open_compressed, detect_encoding, \ @@ -31,7 +33,6 @@ from Orange.util import flatten - # Support values longer than 128K (i.e. text contents features) csv.field_size_limit(100*1024*1024) @@ -164,7 +165,14 @@ def read(self): skipinitialspace=True, ) data = self.data_table(reader) - data.name = path.splitext(path.split(self.filename)[-1])[0] + + # ToDO: Name can be set unconditionally when/if + # self.filename will always be a string with the file name. + # Currently, some tests pass StringIO instead of + # the file name to a reader. 
class GenericHDF5Reader(FileFormat):
    """Reader for generic HDF5 files (``.hdf5``, ``.h5``, ``.nxs``).

    Every dataset found in the file is exposed as a "sheet", identified by
    its absolute path inside the file (for example ``/entry/data``).

    The selected dataset is returned in long format: one row per element,
    with one column per dataset dimension holding the element's index along
    that dimension, followed by one column holding the element's value.

    The file is opened read-only when the reader is constructed and stays
    open (the objects in ``datasets`` refer to it) until :meth:`close` is
    called or the reader is garbage collected.

    Parameters
    ----------
    filename : str
        Path of the HDF5 file to read.

    Attributes
    ----------
    h5_file : h5py.File
        Open, read-only handle of the file.
    datasets : dict
        Maps the absolute path of each dataset to its ``h5py.Dataset``.
    """
    EXTENSIONS = ('.hdf5', '.h5', '.nxs',)
    DESCRIPTION = 'Hierarchical Data Format files'
    SUPPORT_COMPRESSED = False
    SUPPORT_SPARSE_DATA = False

    def __init__(self, filename):
        super().__init__(filename=filename)

        # Request read-only access explicitly instead of relying on the
        # library default, which has varied between h5py releases; a
        # reader must never create or modify the file.
        self.h5_file = h5py.File(filename, "r")

        self.datasets = {}
        try:
            self._load_group("/", self.h5_file)
        except Exception:
            # Do not leave the file open if it cannot be scanned.
            self.h5_file.close()
            raise

    def close(self):
        """Close the underlying HDF5 file.

        The dataset objects stored in ``datasets`` belong to that file, so
        :meth:`read` must not be called after closing.
        """
        self.h5_file.close()

    @property
    def sheets(self) -> List:
        """List of datasets in the file.

        Returns
        -------
        List of dataset paths
        """
        return list(self.datasets)

    def select_sheet(self, sheet):
        """Select the dataset to be read.

        Parameters
        ----------
        sheet : str or None
            Dataset path. ``None`` selects the first dataset found in the
            file; if the file has no datasets, the selection stays ``None``
            and :meth:`read` reports the problem.
        """
        if sheet is None and self.datasets:
            sheet = self.sheets[0]
        self.sheet = sheet

    def read(self):
        """Read the selected dataset and return it as an Orange Table.

        If no dataset has been selected, the first one in the file is used.

        Returns
        -------
        table : Orange.data.Table
            One row per element of the dataset. For an N-dimensional
            dataset the first N columns (named ``"0"`` .. ``"N-1"``) hold
            the element's index along each dimension and the last column
            (named ``"N"``) holds its value. The table is named after the
            last component of the dataset path.

        Raises
        ------
        ValueError
            If the file contains no datasets.
        KeyError
            If the selected path is not a dataset of the file.
        """
        sheet = getattr(self, "sheet", None)
        if sheet is None:
            if not self.datasets:
                raise ValueError("The HDF5 file contains no datasets.")
            sheet = self.sheets[0]

        name = sheet.split('/')[-1]
        values = np.array(self.datasets[sheet])
        shape = values.shape

        # np.indices yields one index grid per dimension. Flattening each
        # grid in C order and transposing gives one row of indices per
        # element, in the same order as values.reshape(-1). Unlike a
        # labelled index, this cannot clash with the dataset's name and it
        # also works for scalar (0-d) datasets.
        positions = np.indices(shape).reshape(len(shape), values.size).T
        X = np.column_stack((positions, values.reshape(-1, 1)))

        # Default column names; they can be changed in the widget itself.
        attrs = [ContinuousVariable(str(i)) for i in range(X.shape[1])]
        table = Table.from_numpy(domain=Domain(attributes=attrs), X=X)
        table.name = name

        return table

    def _load_group(self, root, group):
        """Recursively collect all datasets below ``group``.

        Parameters
        ----------
        root : str
            Absolute path of ``group`` inside the file, ending with ``/``.
        group : h5py.Group
            Group whose members are inspected. Sub-groups are descended
            into; datasets are stored in ``self.datasets`` under their
            absolute path; members of any other kind are ignored.
        """
        for name, obj in group.items():
            # Named child_path so that it does not shadow the module-level
            # ``path`` imported from ``os``.
            child_path = root + name
            if isinstance(obj, h5py.Group):
                self._load_group(child_path + "/", obj)
            elif isinstance(obj, h5py.Dataset):
                self.datasets[child_path] = obj
""" if not filename: return @@ -268,6 +269,14 @@ def package(w): box.layout().addWidget(self.reader_combo) layout.addWidget(box, 0, 1) + # Set an options box for special types of files that require more + # specifications before loading the Orange.table + self.options_box = gui.widgetBox(self.controlArea, + orientation=QGridLayout().setSpacing(4), + box="Options") + # Hide the box until needed + self.options_box.hide() + box = gui.vBox(self.controlArea, "Info") self.infolabel = gui.widgetLabel(box, 'No data loaded.') @@ -282,6 +291,7 @@ def package(w): autoDefault=False ) gui.rubber(box) + self.apply_button = gui.button( box, self, "Apply", callback=self.apply_domain_edit) self.apply_button.setEnabled(False) @@ -452,7 +462,7 @@ def mark_problematic_reader(): self.data = data self.openContext(data.domain) self.apply_domain_edit() # sends data - return None + return None def _get_reader(self) -> FileFormat: if self.source == self.LOCAL_FILE: @@ -483,6 +493,7 @@ def _get_reader(self) -> FileFormat: url = self.url_combo.currentText().strip() return UrlReader(url) + def _update_sheet_combo(self): if len(self.reader.sheets) < 2: self.sheet_box.hide() diff --git a/doc/visual-programming/source/widgets/data/pythonscript.md b/doc/visual-programming/source/widgets/data/pythonscript.md index 6c91b1bca28..aca7c1e20c7 100644 --- a/doc/visual-programming/source/widgets/data/pythonscript.md +++ b/doc/visual-programming/source/widgets/data/pythonscript.md @@ -17,7 +17,7 @@ Extends functionalities through Python scripting. - Classifier (Orange.classification.Learner): classifier retrieved from ``out_classifier`` variable - Object: Python object retrieved from ``out_object`` variable -**Python Script** widget can be used to run a python script in the input, when a suitable functionality is not implemented in an existing widget. The script has ``in_data``, ``in_distance``, ``in_learner``, ``in_classifier`` and ``in_object`` variables (from input signals) in its local namespace. 
If a signal is not connected or it did not yet receive any data, those variables contain ``None``. +**Python Script** widget can be used to run a python script in the input, when a suitable functionality is not implemented in an existing widget. The script has ``in_data``, ``in_distance``, ``in_learner``, ``in_classifier`` and ``in_object`` variables (from input signals) in its local namespace. If a signal is not connected or it did not yet receive any data, those variables contain ``None``. For the case when multiple inputs are connected to the widget, the lists ``in_datas``, ``in_distances``, ``in_learners``, ``in_classifiers`` and ``in_objects`` may be used instead. After the script is executed variables from the script’s local namespace are extracted and used as outputs of the widget. The widget can be further connected to other widgets for visualizing the output. diff --git a/pyproject.toml b/pyproject.toml index 1d45cca96f5..5278ac3f258 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,7 @@ requires = [ "setuptools>=51.0", "sphinx", "wheel", + "h5py", ] build-backend = "setuptools.build_meta" diff --git a/requirements-core.txt b/requirements-core.txt index db1c8d514f2..ceceeab8c84 100644 --- a/requirements-core.txt +++ b/requirements-core.txt @@ -26,3 +26,5 @@ xgboost>=1.7.4 xlrd>=1.2.0 # Writing Excel Files xlsxwriter +# HDF5 binary data format +h5py