From a569e4ccc9ec9546d08dd8e05515fd6fa13735da Mon Sep 17 00:00:00 2001 From: "Youngsook.Kim" Date: Thu, 16 Apr 2026 14:43:35 +0000 Subject: [PATCH 1/5] Initial commit --- .gitignore | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..b24d71e --- /dev/null +++ b/.gitignore @@ -0,0 +1,50 @@ +# These are some examples of commonly ignored file patterns. +# You should customize this list as applicable to your project. +# Learn more about .gitignore: +# https://www.atlassian.com/git/tutorials/saving-changes/gitignore + +# Node artifact files +node_modules/ +dist/ + +# Compiled Java class files +*.class + +# Compiled Python bytecode +*.py[cod] + +# Log files +*.log + +# Package files +*.jar + +# Maven +target/ +dist/ + +# JetBrains IDE +.idea/ + +# Unit test reports +TEST*.xml + +# Generated by MacOS +.DS_Store + +# Generated by Windows +Thumbs.db + +# Applications +*.app +*.exe +*.war + +# Large media files +*.mp4 +*.tiff +*.avi +*.flv +*.mov +*.wmv + From c413a4a08fdd6fd1b4f6a3b25c5935f0c702d24e Mon Sep 17 00:00:00 2001 From: "Youngsook.Kim" Date: Thu, 16 Apr 2026 15:13:09 +0000 Subject: [PATCH 2/5] first commit --- CMakeFiles/CMakeSystem.cmake | 15 ++++++++ lib/CMakeLists.txt | 32 ++++++++++++++++- lib/src/rdswrapper.cpp | 69 ++++++++++++++++++++---------------- setup.cfg | 2 +- setup.py | 54 +++++++++++++++++----------- 5 files changed, 119 insertions(+), 53 deletions(-) create mode 100644 CMakeFiles/CMakeSystem.cmake diff --git a/CMakeFiles/CMakeSystem.cmake b/CMakeFiles/CMakeSystem.cmake new file mode 100644 index 0000000..1a46f66 --- /dev/null +++ b/CMakeFiles/CMakeSystem.cmake @@ -0,0 +1,15 @@ +set(CMAKE_HOST_SYSTEM "Windows") +set(CMAKE_HOST_SYSTEM_NAME "Windows") +set(CMAKE_HOST_SYSTEM_VERSION "") +set(CMAKE_HOST_SYSTEM_PROCESSOR "AMD64") + + + +set(CMAKE_SYSTEM "Windows") +set(CMAKE_SYSTEM_NAME "Windows") +set(CMAKE_SYSTEM_VERSION "") +set(CMAKE_SYSTEM_PROCESSOR "AMD64") + +set(CMAKE_CROSSCOMPILING "FALSE") + +set(CMAKE_SYSTEM_LOADED 1) diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 14470b7..71337a7 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -3,11 +3,36 @@ cmake_minimum_required(VERSION 3.24) project(rds2py VERSION 1.0.0 DESCRIPTION "Building the rds shared library" - LANGUAGES CXX) + LANGUAGES C CXX) # Importing all of the dependencies with pinned versions (even for transitive dependencies). include(FetchContent) +# On Windows (or when zlib is not found), build zlib from source so that +# byteme can use GzipFileReader / GzipFileWriter. +find_package(ZLIB QUIET) +if(NOT ZLIB_FOUND) + message(STATUS "System zlib not found -- building zlib from source via FetchContent") + FetchContent_Declare( + zlib + GIT_REPOSITORY https://github.com/madler/zlib + GIT_TAG v1.3.1 + ) + set(ZLIB_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE) + FetchContent_MakeAvailable(zlib) + + # Make the zlib headers visible to downstream targets and set the + # ZLIB_FOUND / ZLIB::ZLIB variables that byteme's CMakeLists.txt looks for. + add_library(ZLIB::ZLIB ALIAS zlibstatic) + target_include_directories(zlibstatic PUBLIC + "${zlib_SOURCE_DIR}" + "${zlib_BINARY_DIR}" + ) + set(ZLIB_FOUND TRUE CACHE BOOL "" FORCE) + set(ZLIB_LIBRARIES zlibstatic CACHE STRING "" FORCE) + set(ZLIB_INCLUDE_DIRS "${zlib_SOURCE_DIR};${zlib_BINARY_DIR}" CACHE STRING "" FORCE) +endif() + FetchContent_Declare( rds2cpp GIT_REPOSITORY https://github.com/LTLA/rds2cpp @@ -44,6 +69,11 @@ set_property(TARGET ${TARGET} PROPERTY CXX_STANDARD 17) target_link_libraries(${TARGET} PRIVATE rds2cpp pybind11::pybind11) +# On Windows, also link zlib so the gzip symbols are available at link time. +if(NOT ZLIB_FOUND OR TARGET zlibstatic) + target_link_libraries(${TARGET} PRIVATE ZLIB::ZLIB) +endif() + set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME lib_rds_parser PREFIX "" diff --git a/lib/src/rdswrapper.cpp b/lib/src/rdswrapper.cpp index 2ed6aa2..c5f40f8 100644 --- a/lib/src/rdswrapper.cpp +++ b/lib/src/rdswrapper.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include namespace py = pybind11; @@ -10,15 +11,16 @@ namespace py = pybind11; class RdsReader { private: const rds2cpp::RObject* ptr; + const std::vector* symbols; public: - RdsReader(const rds2cpp::RObject* p) : ptr(p) { + RdsReader(const rds2cpp::RObject* p, const std::vector* syms) : ptr(p), symbols(syms) { if (!p) throw std::runtime_error("Null pointer passed to 'RdsReader'."); + if (!syms) throw std::runtime_error("Null symbols pointer passed to 'RdsReader'."); } std::string get_rtype() const { if (!ptr) throw std::runtime_error("Null pointer in 'get_rtype'."); - // py::print("arg::", static_cast(ptr->type())); switch (ptr->type()) { case rds2cpp::SEXPType::S4: return "S4"; case rds2cpp::SEXPType::INT: return "integer"; @@ -69,23 +71,36 @@ class RdsReader { throw std::runtime_error("Invalid type for 'string_arr'"); } const auto& data = static_cast(ptr)->data; - return py::cast(data); + py::list result; + for (const auto& s : data) { + if (s.value.has_value()) { + result.append(py::str(s.value.value())); + } else { + result.append(py::none()); + } + } + return result; } py::list get_attribute_names() const { if (!ptr) throw std::runtime_error("Null pointer in 'get_attribute_names'"); - return py::cast(get_attributes().names); + const auto& attrs = get_attributes(); + py::list names; + for (const auto& attr : attrs) { + names.append((*symbols)[attr.name.index].name); + } + return names; } py::object load_attribute_by_name(const std::string& name) const { if (!ptr) throw std::runtime_error("Null pointer in 'load_attribute_by_name'"); - const auto& attributes = get_attributes(); - auto it = std::find(attributes.names.begin(), attributes.names.end(), name); - if (it == attributes.names.end()) { - throw std::runtime_error("Attribute not found: " + name); + const auto& attrs = get_attributes(); + for (const auto& attr : attrs) { + if ((*symbols)[attr.name.index].name == name) { + return py::cast(new RdsReader(attr.value.get(), symbols)); + } } - size_t index = std::distance(attributes.names.begin(), it); - return py::cast(new RdsReader(attributes.values[index].get())); + throw std::runtime_error("Attribute not found: " + name); } py::object load_vec_element(int index) const { @@ -97,7 +112,7 @@ class RdsReader { if (index < 0 || static_cast(index) >= data.size()) { throw std::out_of_range("Vector index out of range"); } - return py::cast(new RdsReader(data[index].get())); + return py::cast(new RdsReader(data[index].get(), symbols)); } std::string get_package_name() const { @@ -126,7 +141,7 @@ class RdsReader { } private: - const rds2cpp::Attributes& get_attributes() const { + const std::vector& get_attributes() const { if (!ptr) throw std::runtime_error("Null pointer in get_attributes"); switch (ptr->type()) { case rds2cpp::SEXPType::INT: return static_cast(ptr)->attributes; @@ -142,18 +157,18 @@ class RdsReader { class RdsObject { private: - std::unique_ptr parsed; + std::unique_ptr parsed; std::unique_ptr reader; public: RdsObject(const std::string& file) { try { rds2cpp::ParseRdsOptions options; - parsed = std::make_unique(rds2cpp::parse_rds(file, options)); + parsed = std::make_unique(rds2cpp::parse_rds(file, options)); if (!parsed || !parsed->object) { throw std::runtime_error("Failed to parse RDS file"); } - reader = std::make_unique(parsed->object.get()); + reader = std::make_unique(parsed->object.get(), &parsed->symbols); } catch (const std::exception& e) { throw std::runtime_error(std::string("Error in 'RdsObject' constructor: ") + e.what()); } @@ -181,38 +196,31 @@ class RdaObject { py::list get_object_names() const { if (!parsed) throw std::runtime_error("Null parsed in 'get_object_names'"); - const auto& pairlist = parsed->contents; py::list names; - for (size_t i = 0; i < pairlist.tag_names.size(); ++i) { - if (pairlist.has_tag[i]) { - names.append(pairlist.tag_names[i]); - } else { - names.append(py::none()); - } + for (const auto& obj : parsed->objects) { + names.append(parsed->symbols[obj.name.index].name); } return names; } int get_object_count() const { if (!parsed) throw std::runtime_error("Null parsed in 'get_object_count'"); - return static_cast(parsed->contents.data.size()); + return static_cast(parsed->objects.size()); } RdsReader* get_object_by_index(int index) const { if (!parsed) throw std::runtime_error("Null parsed in 'get_object_by_index'"); - const auto& data = parsed->contents.data; - if (index < 0 || static_cast(index) >= data.size()) { + if (index < 0 || static_cast(index) >= parsed->objects.size()) { throw std::out_of_range("Object index out of range"); } - return new RdsReader(data[index].get()); + return new RdsReader(parsed->objects[index].value.get(), &parsed->symbols); } RdsReader* get_object_by_name(const std::string& name) const { if (!parsed) throw std::runtime_error("Null parsed in 'get_object_by_name'"); - const auto& pairlist = parsed->contents; - for (size_t i = 0; i < pairlist.tag_names.size(); ++i) { - if (pairlist.has_tag[i] && pairlist.tag_names[i] == name) { - return new RdsReader(pairlist.data[i].get()); + for (const auto& obj : parsed->objects) { + if (parsed->symbols[obj.name.index].name == name) { + return new RdsReader(obj.value.get(), &parsed->symbols); } } throw std::runtime_error("Object not found: " + name); @@ -234,7 +242,6 @@ PYBIND11_MODULE(lib_rds_parser, m) { .def("get_object_by_name", &RdaObject::get_object_by_name, py::return_value_policy::take_ownership, py::keep_alive<0, 1>()); py::class_(m, "RdsReader") - .def(py::init()) .def("get_rtype", &RdsReader::get_rtype) .def("get_rsize", &RdsReader::get_rsize) .def("get_numeric_data", &RdsReader::get_numeric_data) diff --git a/setup.cfg b/setup.cfg index c7dbc2a..94469fc 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,7 +24,7 @@ project_urls = # Twitter = https://twitter.com/PyScaffold # Change if running only on Windows, Mac or Linux (comma-separated) -platforms = Mac, Linux +platforms = Mac, Linux, Windows # Add here all kinds of additional classifiers as defined under # https://pypi.org/classifiers/ diff --git a/setup.py b/setup.py index 8da113f..07edeb2 100644 --- a/setup.py +++ b/setup.py @@ -7,6 +7,7 @@ from setuptools import setup, Extension from setuptools.command.build_ext import build_ext as build_ext_orig +import glob import pathlib import os import shutil @@ -33,23 +34,26 @@ def build_cmake(self, ext): outpath = os.path.join(build_lib.absolute(), ext.name) build_temp = os.path.join(build_temp, "build") - if not os.path.exists(build_temp): - cmd = [ - "cmake", - "-S", - "lib", - "-B", - build_temp, - "-Dpybind11_DIR=" + os.path.join(os.path.dirname(pybind11.__file__), "share", "cmake", "pybind11"), - "-DPYTHON_EXECUTABLE=" + sys.executable, - ] - if os.name != "nt": - cmd.append("-DCMAKE_BUILD_TYPE=Release") - cmd.append("-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + outpath) + # Remove stale build dir to avoid FetchContent conflicts with + # leftover _deps from a previous pip build environment. + if os.path.exists(build_temp): + shutil.rmtree(build_temp, ignore_errors=True) + cmd = [ + "cmake", + "-S", + "lib", + "-B", + build_temp, + "-Dpybind11_DIR=" + os.path.join(os.path.dirname(pybind11.__file__), "share", "cmake", "pybind11"), + "-DPYTHON_EXECUTABLE=" + sys.executable, + ] + if os.name != "nt": + cmd.append("-DCMAKE_BUILD_TYPE=Release") + cmd.append("-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + outpath) - if "MORE_CMAKE_OPTIONS" in os.environ: - cmd += os.environ["MORE_CMAKE_OPTIONS"].split() - self.spawn(cmd) + if "MORE_CMAKE_OPTIONS" in os.environ: + cmd += os.environ["MORE_CMAKE_OPTIONS"].split() + self.spawn(cmd) if not self.dry_run: cmd = ["cmake", "--build", build_temp] @@ -59,10 +63,20 @@ def build_cmake(self, ext): if os.name == "nt": # Gave up trying to get MSVC to respect the output directory. # Delvewheel also needs it to have a 'pyd' suffix... whatever. - shutil.copyfile( - os.path.join(build_temp, "Release", "_core.dll"), - os.path.join(outpath, "_core.pyd"), - ) + # The CMake target name is lib_rds_parser; MSVC puts it under Release/. + # pybind11 may add an ABI tag (e.g. lib_rds_parser.cp312-win_amd64.pyd). + release_dir = os.path.join(build_temp, "Release") + candidates = glob.glob(os.path.join(release_dir, "lib_rds_parser*")) + if not candidates: + raise RuntimeError( + f"Cannot find compiled library in {release_dir}. " + f"Contents: {os.listdir(release_dir) if os.path.isdir(release_dir) else 'dir not found'}" + ) + # Prefer .pyd files over .lib/.exp + pyd_files = [c for c in candidates if c.endswith(".pyd")] + src_name = pyd_files[0] if pyd_files else candidates[0] + os.makedirs(outpath, exist_ok=True) + shutil.copyfile(src_name, os.path.join(outpath, "lib_rds_parser.pyd")) if __name__ == "__main__": From 16b4f16827e36060d618b18c500327aaabe735af Mon Sep 17 00:00:00 2001 From: "Youngsook.Kim" Date: Thu, 16 Apr 2026 16:12:29 +0000 Subject: [PATCH 3/5] update readme --- README.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/README.md b/README.md index 74f419a..e226598 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,29 @@ Parse and construct Python representations for datasets stored in **RDS or RData** files. `rds2py` supports various base classes from R, and Bioconductor's `SummarizedExperiment` and `SingleCellExperiment` S4 classes. **_For more details, check out [rds2cpp library](https://github.com/LTLA/rds2cpp)._** +## Fixes +Cloned from [rds2py](https://github.com/BiocPy/rds2py). The repo can't be installed on Windows. With a AI agent, a few fixes are made to make it installalbe on Windows. + +rds2py now builds and works on Windows. Here's a summary of the four issues that waere fixed: + +1. lib/src/rdswrapper.cpp — Updated for new rds2cpp API +The upstream rds2cpp library had breaking API changes: + +Attributes: Changed from a struct with .names/.values vectors to std::vector where each Attribute has a SymbolIndex name + unique_ptr value. Attribute names now require a lookup into a global symbols table. +StringVector::data: Changed from vector to vector where String has an optional value (supports missing/NA strings). +RdaFile: contents (a PairList-like struct with .tag_names, .has_tag, .data) was replaced by objects (a vector with .name as SymbolIndex and .value). +The RdsReader class now carries a const std::vector* pointer to resolve symbol names. Added #include . + +2. lib/CMakeLists.txt — Added zlib for Windows +The byteme library conditionally includes gzip support only when zlib.h is available (#if __has_include("zlib.h")). On Windows, zlib isn't typically installed. Added CMake logic to auto-fetch and build zlib from source via FetchContent when the system zlib isn't found. + +3. setup.py — Fixed Windows file copy +The old code copied _core.dll → _core.pyd, but the actual CMake output is named lib_rds_parser. +pybind11 appends ABI tags (e.g. lib_rds_parser.cp312-win_amd64.pyd), so a glob pattern is now used to find the built file. +4. setup.cfg — Added Windows platform +Changed platforms = Mac, Linux to platforms = Mac, Linux, Windows. + + ## Installation Package is published to [PyPI](https://pypi.org/project/rds2py/) From faff3dbf33218a351249deb14a48457bc1031388 Mon Sep 17 00:00:00 2001 From: "Youngsook.Kim" Date: Thu, 16 Apr 2026 16:14:00 +0000 Subject: [PATCH 4/5] update readme --- README.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/README.md b/README.md index e226598..a91ba88 100644 --- a/README.md +++ b/README.md @@ -7,13 +7,10 @@ Parse and construct Python representations for datasets stored in **RDS or RData** files. `rds2py` supports various base classes from R, and Bioconductor's `SummarizedExperiment` and `SingleCellExperiment` S4 classes. **_For more details, check out [rds2cpp library](https://github.com/LTLA/rds2cpp)._** ## Fixes -Cloned from [rds2py](https://github.com/BiocPy/rds2py). The repo can't be installed on Windows. With a AI agent, a few fixes are made to make it installalbe on Windows. - -rds2py now builds and works on Windows. Here's a summary of the four issues that waere fixed: +Cloned from [rds2py](https://github.com/BiocPy/rds2py). The repo can't be installed on Windows. With a AI agent, a few fixes are made to make it installable on Windows. Here's a summary of the four issues that were fixed: 1. lib/src/rdswrapper.cpp — Updated for new rds2cpp API The upstream rds2cpp library had breaking API changes: - Attributes: Changed from a struct with .names/.values vectors to std::vector where each Attribute has a SymbolIndex name + unique_ptr value. Attribute names now require a lookup into a global symbols table. StringVector::data: Changed from vector to vector where String has an optional value (supports missing/NA strings). RdaFile: contents (a PairList-like struct with .tag_names, .has_tag, .data) was replaced by objects (a vector with .name as SymbolIndex and .value). From bb40b1c105e992d74140227227b28d700b3abd63 Mon Sep 17 00:00:00 2001 From: "Youngsook.Kim" Date: Wed, 22 Apr 2026 22:08:33 -0400 Subject: [PATCH 5/5] revert README.md to upstream version --- README.md | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/README.md b/README.md index a91ba88..74f419a 100644 --- a/README.md +++ b/README.md @@ -6,26 +6,6 @@ Parse and construct Python representations for datasets stored in **RDS or RData** files. `rds2py` supports various base classes from R, and Bioconductor's `SummarizedExperiment` and `SingleCellExperiment` S4 classes. **_For more details, check out [rds2cpp library](https://github.com/LTLA/rds2cpp)._** -## Fixes -Cloned from [rds2py](https://github.com/BiocPy/rds2py). The repo can't be installed on Windows. With a AI agent, a few fixes are made to make it installable on Windows. Here's a summary of the four issues that were fixed: - -1. lib/src/rdswrapper.cpp — Updated for new rds2cpp API -The upstream rds2cpp library had breaking API changes: -Attributes: Changed from a struct with .names/.values vectors to std::vector where each Attribute has a SymbolIndex name + unique_ptr value. Attribute names now require a lookup into a global symbols table. -StringVector::data: Changed from vector to vector where String has an optional value (supports missing/NA strings). -RdaFile: contents (a PairList-like struct with .tag_names, .has_tag, .data) was replaced by objects (a vector with .name as SymbolIndex and .value). -The RdsReader class now carries a const std::vector* pointer to resolve symbol names. Added #include . - -2. lib/CMakeLists.txt — Added zlib for Windows -The byteme library conditionally includes gzip support only when zlib.h is available (#if __has_include("zlib.h")). On Windows, zlib isn't typically installed. Added CMake logic to auto-fetch and build zlib from source via FetchContent when the system zlib isn't found. - -3. setup.py — Fixed Windows file copy -The old code copied _core.dll → _core.pyd, but the actual CMake output is named lib_rds_parser. -pybind11 appends ABI tags (e.g. lib_rds_parser.cp312-win_amd64.pyd), so a glob pattern is now used to find the built file. -4. setup.cfg — Added Windows platform -Changed platforms = Mac, Linux to platforms = Mac, Linux, Windows. - - ## Installation Package is published to [PyPI](https://pypi.org/project/rds2py/)