diff --git a/.github/workflows/greetings.yml b/.github/workflows/greetings.yml index 39c66f9..062f7ff 100644 --- a/.github/workflows/greetings.yml +++ b/.github/workflows/greetings.yml @@ -9,8 +9,8 @@ jobs: issues: write pull-requests: write steps: - - uses: actions/first-interaction@v1 + - uses: actions/first-interaction@v1.3.0 with: repo-token: ${{ secrets.GITHUB_TOKEN }} - issue-message: "Thanks for contributing! Please follow the guidelines.' first issue" + issue-message: "Thanks for contributing! Please follow the guidelines." pr-message: "Thanks for contributing! Please follow the guidelines." diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 741d393..61aa05a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -21,29 +21,24 @@ jobs: python-version: ["3.9", "3.10", "3.11"] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies + - name: Install poetry run: | - python -m pip install --upgrade pip - pip install -r requirements.txt - python -m pip install flake8 pytest + curl -sSL https://install.python-poetry.org | python3 - + echo "$HOME/.local/bin" >> $GITHUB_PATH - # - name: Install poetry - # run: | - # curl -sSL https://install.python-poetry.org | python - - - # - name: Install project dependencies - # run: | - # poetry install + - name: Install project dependencies + run: | + poetry install --with dev - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + poetry run flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Run Tests - run: python -m unittest tests/test_orcid.py + poetry run flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Run Tests + run: poetry run python -m unittest tests/test_orcid.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 7fdc3df..025c8e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,12 @@ # Changelog All notable changes to this project will be documented in this file. +## [1.2.1] - 11/03/2025 + +### Changed +- Dependencies updated; xmltojson removed in favor of xmltodict for broader compatibility. Now using an up-to-date certifi. +- Moved dependency management to Poetry to better suit our needs. +- Refactored using Claude to standardize linting, comments, add logging, and type hints. ## [1.2.0] - 05/17/2024 diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..3ca7e66 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,123 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +PyOrcid is a Python library and API client for interacting with the ORCID API. ORCID (Open Researcher and Contributor ID) provides unique identifiers to researchers. This library enables developers to access and manage ORCID profile data, including publications, employment, education, and other research activities. 
+ +## Development Commands + +### Package Management +The project uses Poetry for dependency management: +```bash +# Install dependencies +poetry install + +# Add a dependency +poetry add PACKAGE_NAME + +# Add a dev dependency +poetry add --group dev PACKAGE_NAME +``` + +### Testing +```bash +# Run all tests +python -m unittest tests/test_orcid.py + +# Run tests with pytest (if installed) +pytest tests/ + +# Run a single test +python -m unittest tests.test_orcid.TestOrcid.test_access_token_valid +``` + +### Linting +```bash +# Check for Python syntax errors and undefined names +flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + +# Full linting with warnings +flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics +``` + +### Building +```bash +# Build the package using Poetry +poetry build +``` + +## Architecture + +### Core Classes + +The library is organized into four main classes in `src/pyorcid/`: + +1. **`Orcid` (orcid.py)** - The main API wrapper class + - Handles both Public and Member API access + - Requires an ORCID ID and access token + - Supports sandbox mode for testing + - Methods map to ORCID API v3.0 endpoints (e.g., `/works`, `/person`, `/educations`) + - Returns tuples: `(processed_data, raw_api_response)` for most section methods + - Key methods: `record()`, `works()`, `educations()`, `employments()`, `fundings()`, `person()` + - Helper: `generate_markdown_file()` creates formatted reports + +2. **`OrcidAuthentication` (orcid_authentication.py)** - Handles OAuth 2.0 authentication + - `get_public_access_token()` - For reading public data (/read-public scope), no user auth required + - `get_private_access_token()` - For Member API or limited-access data, requires user authorization + - Supports both production and sandbox environments + - Manages redirect URIs and authorization codes + +3. 
**`OrcidScrapper` (orcid_scrapper.py)** - Alternative data access via web scraping + - Inherits from `Orcid` class + - Scrapes public ORCID profiles without authentication + - Converts XML responses to JSON and reformats to match API structure + - Only works with public profiles + - Overrides `__read_section()` to use web scraping instead of API calls + +4. **`OrcidSearch` (orcid_search.py)** - Wrapper for ORCID Search API + - `search(query, start, rows, search_mode, columns)` - Searches ORCID registry + - Supports three search modes: "expanded-search", "search", "csv-search" + - Handles query encoding and pagination + - Requires access token for authentication + +### API Modes + +The library supports two ORCID API modes: +- **Public API** (`state="public"`): Read-only access to public profiles, uses `pub.orcid.org` +- **Member API** (`state="member"`): Read/write access for ORCID members, uses `api.orcid.org` + +Both modes support sandbox environments for testing (`sandbox=True` parameter). + +### Data Processing Pipeline + +1. **Token validation**: All classes except `OrcidScrapper` validate tokens on initialization +2. **API requests**: Made via `__read_section(section)` private method +3. **Data extraction**: Helper methods like `__get_value_from_keys()` navigate nested JSON safely +4. **Formatting**: Methods like `get_formatted_date()` convert API data to user-friendly formats +5. **Unicode handling**: `__deunicode_string()` removes non-ASCII characters for compatibility + +### Testing Approach + +Tests use mocked HTTP requests (unittest.mock) to avoid live API calls. The main `Orcid` class includes special `__test_*` methods that pull credentials from environment variables (`ORCID_ACCESS_TOKEN`) for CI/CD integration with GitHub Actions. 
+ +## Important Patterns + +- **Private methods**: Methods prefixed with `__` (double underscore) are internal-only +- **Error handling**: Token validation occurs in `__init__()` for early failure detection +- **Return tuples**: Section methods typically return `(simplified_data, raw_data)` to provide both convenience and full access +- **Safe navigation**: `__get_value_from_keys()` prevents KeyError on missing nested keys +- **Inheritance**: `OrcidScrapper` extends `Orcid` to reuse data processing logic while changing the data source + +## Dependencies + +Core dependencies: +- `requests` - HTTP client for API calls +- `python-dotenv` - Environment variable management +- `urllib3` - URL handling and encoding +- `xmltodict` - XML to dict conversion (for scraping) + +Development dependencies: +- `pytest` - Testing framework +- `flake8` - Linting (used in CI/CD) diff --git a/README.md b/README.md index 5157710..bd1be1b 100644 --- a/README.md +++ b/README.md @@ -118,8 +118,9 @@ orcid.__dir__() ```python # Get the information of user's works from their ORCID profile works_data = orcid.works()[0] -for key, value in works_data.items(): - print(key, value) +for work in works_data: + for key, value in work.items(): + print(key, value) ``` ```python diff --git a/poetry.lock b/poetry.lock index a1666d7..d91a4fe 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,14 +1,15 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. [[package]] name = "certifi" -version = "2023.7.22" +version = "2025.10.5" description = "Python package for providing Mozilla's CA Bundle." 
optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" +groups = ["main"] files = [ - {file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"}, - {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"}, + {file = "certifi-2025.10.5-py3-none-any.whl", hash = "sha256:0f212c2744a9bb6de0c56639a6f68afe01ecd92d91f14ae897c4fe7bbeeef0de"}, + {file = "certifi-2025.10.5.tar.gz", hash = "sha256:47c09d31ccf2acf0be3f701ea53595ee7e0b8fa08801c6624be771df09ae7b43"}, ] [[package]] @@ -17,6 +18,7 @@ version = "3.2.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." optional = false python-versions = ">=3.7.0" +groups = ["main"] files = [ {file = "charset-normalizer-3.2.0.tar.gz", hash = "sha256:3bb3d25a8e6c0aedd251753a79ae98a093c7e7b471faa3aa9a93a81431987ace"}, {file = "charset_normalizer-3.2.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b87549028f680ca955556e3bd57013ab47474c3124dc069faa0b6545b6c9710"}, @@ -101,6 +103,8 @@ version = "0.4.6" description = "Cross-platform colored terminal text." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["dev"] +markers = "sys_platform == \"win32\"" files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, @@ -112,6 +116,8 @@ version = "1.1.3" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" +groups = ["dev"] +markers = "python_version < \"3.11\"" files = [ {file = "exceptiongroup-1.1.3-py3-none-any.whl", hash = "sha256:343280667a4585d195ca1cf9cef84a4e178c4b6cf2274caef9859782b567d5e3"}, {file = "exceptiongroup-1.1.3.tar.gz", hash = "sha256:097acd85d473d75af5bb98e41b61ff7fe35efe6675e4f9370ec6ec5126d160e9"}, @@ -120,12 +126,30 @@ files = [ [package.extras] test = ["pytest (>=6)"] +[[package]] +name = "flake8" +version = "7.3.0" +description = "the modular source code checker: pep8 pyflakes and co" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "flake8-7.3.0-py2.py3-none-any.whl", hash = "sha256:b9696257b9ce8beb888cdbe31cf885c90d31928fe202be0889a7cdafad32f01e"}, + {file = "flake8-7.3.0.tar.gz", hash = "sha256:fe044858146b9fc69b551a4b490d69cf960fcb78ad1edcb84e7fbb1b4a8e3872"}, +] + +[package.dependencies] +mccabe = ">=0.7.0,<0.8.0" +pycodestyle = ">=2.14.0,<2.15.0" +pyflakes = ">=3.4.0,<3.5.0" + [[package]] name = "idna" version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.5" +groups = ["main"] files = [ {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, @@ -137,17 +161,31 @@ version = "2.0.0" description = "brain-dead simple 
config-ini parsing" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, ] +[[package]] +name = "mccabe" +version = "0.7.0" +description = "McCabe checker, plugin for flake8" +optional = false +python-versions = ">=3.6" +groups = ["dev"] +files = [ + {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, + {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, +] + [[package]] name = "packaging" version = "23.1" description = "Core utilities for Python packages" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, @@ -155,40 +193,82 @@ files = [ [[package]] name = "pluggy" -version = "1.3.0" +version = "1.6.0" description = "plugin and hook calling mechanisms for python" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" +groups = ["dev"] files = [ - {file = "pluggy-1.3.0-py3-none-any.whl", hash = "sha256:d89c696a773f8bd377d18e5ecda92b7a3793cbe66c87060a6fb58c7b6e1061f7"}, - {file = "pluggy-1.3.0.tar.gz", hash = "sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12"}, + {file = "pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746"}, + {file = "pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3"}, ] [package.extras] dev = ["pre-commit", "tox"] -testing = ["pytest", 
"pytest-benchmark"] +testing = ["coverage", "pytest", "pytest-benchmark"] + +[[package]] +name = "pycodestyle" +version = "2.14.0" +description = "Python style guide checker" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "pycodestyle-2.14.0-py2.py3-none-any.whl", hash = "sha256:dd6bf7cb4ee77f8e016f9c8e74a35ddd9f67e1d5fd4184d86c3b98e07099f42d"}, + {file = "pycodestyle-2.14.0.tar.gz", hash = "sha256:c4b5b517d278089ff9d0abdec919cd97262a3367449ea1c8b49b91529167b783"}, +] + +[[package]] +name = "pyflakes" +version = "3.4.0" +description = "passive checker of Python programs" +optional = false +python-versions = ">=3.9" +groups = ["dev"] +files = [ + {file = "pyflakes-3.4.0-py2.py3-none-any.whl", hash = "sha256:f742a7dbd0d9cb9ea41e9a24a918996e8170c799fa528688d40dd582c8265f4f"}, + {file = "pyflakes-3.4.0.tar.gz", hash = "sha256:b24f96fafb7d2ab0ec5075b7350b3d2d2218eab42003821c06344973d3ea2f58"}, +] + +[[package]] +name = "pygments" +version = "2.19.2" +description = "Pygments is a syntax highlighting package written in Python." 
+optional = false +python-versions = ">=3.8" +groups = ["dev"] +files = [ + {file = "pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b"}, + {file = "pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887"}, +] + +[package.extras] +windows-terminal = ["colorama (>=0.4.6)"] [[package]] name = "pytest" -version = "7.4.1" +version = "8.4.2" description = "pytest: simple powerful testing with Python" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" +groups = ["dev"] files = [ - {file = "pytest-7.4.1-py3-none-any.whl", hash = "sha256:460c9a59b14e27c602eb5ece2e47bec99dc5fc5f6513cf924a7d03a578991b1f"}, - {file = "pytest-7.4.1.tar.gz", hash = "sha256:2f2301e797521b23e4d2585a0a3d7b5e50fdddaaf7e7d6773ea26ddb17c213ab"}, + {file = "pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79"}, + {file = "pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01"}, ] [package.dependencies] -colorama = {version = "*", markers = "sys_platform == \"win32\""} -exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} -iniconfig = "*" -packaging = "*" -pluggy = ">=0.12,<2.0" -tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} +colorama = {version = ">=0.4", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1", markers = "python_version < \"3.11\""} +iniconfig = ">=1" +packaging = ">=20" +pluggy = ">=1.5,<2" +pygments = ">=2.7.2" +tomli = {version = ">=1", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "requests", "setuptools", "xmlschema"] [[package]] name = 
"python-dotenv" @@ -196,6 +276,7 @@ version = "1.0.0" description = "Read key-value pairs from a .env file and set them as environment variables" optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "python-dotenv-1.0.0.tar.gz", hash = "sha256:a8df96034aae6d2d50a4ebe8216326c61c3eb64836776504fcca410e5937a3ba"}, {file = "python_dotenv-1.0.0-py3-none-any.whl", hash = "sha256:f5971a9226b701070a4bf2c38c89e5a3f0d64de8debda981d1db98583009122a"}, @@ -210,6 +291,7 @@ version = "2.31.0" description = "Python HTTP for Humans." optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, @@ -231,6 +313,8 @@ version = "2.0.1" description = "A lil' TOML parser" optional = false python-versions = ">=3.7" +groups = ["dev"] +markers = "python_version < \"3.11\"" files = [ {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, @@ -242,17 +326,33 @@ version = "1.26.16" description = "HTTP library with thread-safe connection pooling, file post, and more." 
optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +groups = ["main"] files = [ {file = "urllib3-1.26.16-py2.py3-none-any.whl", hash = "sha256:8d36afa7616d8ab714608411b4a3b13e58f463aee519024578e062e141dce20f"}, {file = "urllib3-1.26.16.tar.gz", hash = "sha256:8f135f6502756bde6b2a9b28989df5fbe87c9970cecaa69041edcce7f0589b14"}, ] [package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotlipy (>=0.6.0)"] -secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] +brotli = ["brotli (>=1.0.9) ; (os_name != \"nt\" or python_version >= \"3\") and platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; (os_name != \"nt\" or python_version >= \"3\") and platform_python_implementation != \"CPython\"", "brotlipy (>=0.6.0) ; os_name == \"nt\" and python_version < \"3\""] +secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress ; python_version == \"2.7\"", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] +[[package]] +name = "xmltodict" +version = "1.0.2" +description = "Makes working with XML feel like you are working with JSON" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "xmltodict-1.0.2-py3-none-any.whl", hash = "sha256:62d0fddb0dcbc9f642745d8bbf4d81fd17d6dfaec5a15b5c1876300aad92af0d"}, + {file = "xmltodict-1.0.2.tar.gz", hash = "sha256:54306780b7c2175a3967cad1db92f218207e5bc1aba697d887807c0fb68b7649"}, +] + +[package.extras] +test = ["pytest", "pytest-cov"] + [metadata] -lock-version = "2.0" -python-versions = "^3.8" -content-hash = "4228ba8ae3e2c6bd962ddb60b8c0cfb7464bf3b17db6ac46295e11aa390617a0" +lock-version = "2.1" +python-versions = ">=3.9" +content-hash = "01ae36abbe96b6f315bf8210ec5fc381dcd59a15de867003aacea39d51ad6379" diff --git a/pyproject.toml b/pyproject.toml index 1576418..4709d0c 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -18,11 +18,15 @@ classifiers = [ keywords = ["orcid", "orcid-api", "pyorcid"] [tool.poetry.dependencies] -python = "^3.8" +python = ">=3.9" python-dotenv = "*" urllib3 = "^1.26.7" requests = "^2.26.0" -xmltojson = "*" +xmltodict = "*" +certifi = ">=2024.0.0" -[tool.poetry.dev-dependencies] -pytest = "^7.0.0" +[dependency-groups] +dev = [ + "pytest (>=8.4.2,<9.0.0)", + "flake8 (>=7.0.0,<8.0.0)" +] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index f515224..0000000 --- a/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -python-dotenv -urllib3>=1.26.7 -requests>=2.26.0 -xmltojson diff --git a/reuirements-dev.txt b/reuirements-dev.txt deleted file mode 100644 index f77c910..0000000 --- a/reuirements-dev.txt +++ /dev/null @@ -1 +0,0 @@ -pytest>=7.0.0 diff --git a/src/pyorcid/__init__.py b/src/pyorcid/__init__.py index e2841d4..65cadd7 100644 --- a/src/pyorcid/__init__.py +++ b/src/pyorcid/__init__.py @@ -1,4 +1,6 @@ from .orcid import Orcid from .orcid_authentication import OrcidAuthentication from .orcid_scrapper import OrcidScrapper -from .orcid_search import OrcidSearch \ No newline at end of file +from .orcid_search import OrcidSearch + +__all__ = ["Orcid", "OrcidAuthentication", "OrcidScrapper", "OrcidSearch"] diff --git a/src/pyorcid/orcid.py b/src/pyorcid/orcid.py index 747c5a3..0c441e1 100644 --- a/src/pyorcid/orcid.py +++ b/src/pyorcid/orcid.py @@ -1,111 +1,147 @@ -import requests -from dotenv import load_dotenv +from __future__ import annotations + +import logging import os from datetime import datetime +from typing import Any + +import requests -class Orcid(): +logger = logging.getLogger(__name__) + + +class Orcid: ''' This is a wrapper class for ORCID API ''' - def __init__(self,orcid_id, orcid_access_token = " ", state="public", sandbox=False) -> None: - ''' - Initialize orcid instance - orcid_id : Orcid ID of the user - orcid_access_token : Orcid access token obtained from the user with this orcid_id - state : 
Whether to use public or member API of ORCID - sandbox : a boolean value to show if the ORCID sandbox API should be used (default: False) - ''' + def __init__( + self, + orcid_id: str, + orcid_access_token: str = " ", + state: str = "public", + sandbox: bool = False + ) -> None: + """Initialize orcid instance. + + Args: + orcid_id: ORCID ID of the user + orcid_access_token: ORCID access token obtained from the user + state: Whether to use "public" or "member" API of ORCID + sandbox: Whether to use ORCID sandbox API for testing + + Raises: + ValueError: If access token is invalid + """ self._orcid_id = orcid_id self._orcid_access_token = orcid_access_token self._state = state self._sandbox = sandbox - #For testing purposes (pytesting on github workflow) - if orcid_access_token!=" ": + self._session = requests.Session() + + # For testing purposes (pytesting on github workflow) + if orcid_access_token.strip() and orcid_access_token != " ": try: self.__test_is_access_token_valid() - except: + except (KeyError, OSError): if not self.__is_access_token_valid(): - raise ValueError(f"Invalid access token! Please make sure the user with ORCID_ID:{orcid_id} has given access.") + raise ValueError( + f"Invalid access token! Please make sure the user with " + f"ORCID_ID:{orcid_id} has given access." + ) - return + def __is_access_token_valid(self) -> bool: + """Check if the current access token is valid. - def __is_access_token_valid(self): - ''' - Checks if the current access token is valid - ''' + Returns: + True if token is valid, False otherwise + + Raises: + ValueError: If access token is empty + """ access_token = self._orcid_access_token - if access_token=="": - raise ValueError("Empty value for access token! Please make sure you are authenticated by ORCID as developer.") - # Make a test request to the API using the token + if not access_token or access_token.strip() == "": + raise ValueError( + "Empty value for access token! 
Please make sure you are " + "authenticated by ORCID as developer." + ) + headers = { 'Authorization': f'Bearer {access_token}', 'Content-Type': 'application/json' } - api_url = "" + api_url = self.__get_api_url() - if self._state == "public": - # Specify the ORCID record endpoint for the desired ORCID iD - api_url = f'https://pub.orcid.org/v3.0/{self._orcid_id}' - if(self._sandbox): - api_url = f'https://pub.sandbox.orcid.org/v3.0/{self._orcid_id}' #for testing + try: + response = self._session.get(api_url, headers=headers, timeout=10) + # Status code 200 or 404 means token is valid (404 just means empty profile) + return response.status_code in (200, 404) + except requests.RequestException as e: + logger.warning(f"Failed to validate access token: {e}") + return False - elif self._state == "member": - api_url = f'https://api.orcid.org/v3.0/{self._orcid_id}' - if(self._sandbox): - api_url = f'https://api.sandbox.orcid.org/v3.0/{self._orcid_id}' #for testing + def __get_api_url(self, section: str = "") -> str: + """Construct API URL based on state and sandbox settings. - response = requests.get(api_url, headers=headers) + Args: + section: Optional API section to append to URL - if response.status_code == 404: - # The request was successful, and the token is likely valid - return False + Returns: + Complete API URL + """ + if self._state == "public": + base = ("https://pub.sandbox.orcid.org" if self._sandbox + else "https://pub.orcid.org") + elif self._state == "member": + base = ("https://api.sandbox.orcid.org" if self._sandbox + else "https://api.orcid.org") else: - # The request failed, indicating that the token may have expired or is invalid - return True - - - def __read_section(self,section="record"): - ''' - Reads the section of a Orcid member Profile - return : a dictionary of summary view of the section of ORCID data - ''' - + raise ValueError( + f"Invalid state: {self._state}. " + "Must be 'public' or 'member'." 
+ ) + + url = f"{base}/v3.0/{self._orcid_id}" + if section: + url = f"{url}/{section}" + return url + + def __read_section(self, section: str = "record") -> dict[str, Any] | None: + """Read a section of an ORCID profile. + + Args: + section: The ORCID API section to read (default: "record") + + Returns: + Dictionary containing the section data, or None if request fails + + Raises: + requests.RequestException: If the API request fails + """ access_token = self._orcid_access_token - # Set the headers with the access token for authentication headers = { 'Authorization': f'Bearer {access_token}', 'Content-Type': 'application/json' } - api_url = "" - - if self._state == "public": - # Specify the ORCID record endpoint for the desired ORCID iD - api_url = f'https://pub.orcid.org/v3.0/{self._orcid_id}/{section}' - if(self._sandbox): - api_url = f'https://pub.sandbox.orcid.org/v3.0/{self._orcid_id}/{section}' #for testing - - elif self._state == "member": - api_url = f'https://api.orcid.org/v3.0/{self._orcid_id}/{section}' - if(self._sandbox): - api_url = f'https://api.sandbox.orcid.org/v3.0/{self._orcid_id}/{section}' #for testing + api_url = self.__get_api_url(section) - # Make a GET request to retrieve the ORCID record - response = requests.get(api_url, headers=headers) - - # The request was successful - data = response.json() - # Check the response status code - if response.status_code == 200 or data is not None: - return data - else: - # Handle the case where the request failed - print("Failed to retrieve ORCID data. Status code:", response.status_code) + try: + response = self._session.get(api_url, headers=headers, timeout=30) + response.raise_for_status() + return response.json() + except requests.HTTPError as e: + logger.error(f"Failed to retrieve ORCID data. 
Status code: {e.response.status_code}") + return None + except requests.RequestException as e: + logger.error(f"Request failed: {e}") return None - + except ValueError as e: + logger.error(f"Failed to parse JSON response: {e}") + return None + def __timestamp_to_iso_date(self, timestamp): ''' Converts a timestamp to an ISO date string @@ -130,7 +166,7 @@ def __timestamp_to_iso_date(self, timestamp): return iso8601 except Exception as e: raise ValueError(f"Error: {e}") - + def __deunicode_string(self, s): ''' Removes non-ASCII characters from a string @@ -147,135 +183,159 @@ def __deunicode_string(self, s): def record(self): ''' Reads the Orcid record - return : a dictionary of summary view of the full ORCID record + return : a dictionary of summary view of the full ORCID record ''' return self.__read_section("record") - + def person(self): ''' - Read biographical section of the ORCID record, including through /researcher-urls below + Read biographical section of the ORCID record, including + through /researcher-urls below return : dict with name, biography, researcher-urls ''' - data = self.__read_section("person") - name = self.__get_value_from_keys(data,["name","given-names", "value"]) - bio = self.__get_value_from_keys(data,["biography", "content"]) + data = self.__read_section("person") + name = self.__get_value_from_keys( + data, ["name", "given-names", "value"]) + bio = self.__get_value_from_keys(data, ["biography", "content"]) urls = [] - if "researcher-urls" in data and "researcher-url" in data["researcher-urls"]: + if ("researcher-urls" in data and + "researcher-url" in data["researcher-urls"]): researcher_urls = data["researcher-urls"]["researcher-url"] if isinstance(researcher_urls, list): for research_url in researcher_urls: - url_name = self.__get_value_from_keys(research_url,["url-name"]) - url_value = self.__get_value_from_keys(research_url,["url","value"]) + url_name = self.__get_value_from_keys( + research_url, ["url-name"]) + url_value = 
self.__get_value_from_keys( + research_url, ["url", "value"]) if url_name or url_value: - urls.append({"URL Name":url_name,"URL": url_value}) + urls.append({ + "URL Name": url_name, + "URL": url_value + }) + + return {"Name": name, "Bio": bio, "URLs": urls} - return {"Name":name,"Bio":bio,"URLs":urls} - def address(self): ''' The researcher's countries or regions return : ''' - return self.__read_section("address") - + return self.__read_section("address") + def email(self): ''' The email address(es) associated with the record - return : A tuple of list of emails and whole info tree related to email from orcid + return : A tuple of list of emails and whole info tree + related to email from orcid ''' - data = self.__read_section("email") - emails = [email['email'] for email in self.__get_value_from_keys(data,["person","emails","email"])] + data = self.__read_section("email") + emails = [email['email'] for email in self.__get_value_from_keys( + data, ["person", "emails", "email"])] return emails, data - + def external_identifiers(self): ''' Linked external identifiers in other systems return : ''' - return self.__read_section("external-identifiers") - + return self.__read_section("external-identifiers") + def keywords(self): ''' Keywords related to the researcher and their work - return : A tuple of list of keywords and whole info tree related to keywords from orcid + return : A tuple of list of keywords and whole info tree + related to keywords from orcid ''' - data = self.__read_section("keywords") - lis = [(value["content"]) for value in data["keyword"]] + data = self.__read_section("keywords") + lis = [value["content"] for value in data["keyword"]] return (lis, data) - + def other_names(self): ''' Other names by which the researcher is known return : ''' - return self.__read_section("other-names") - + return self.__read_section("other-names") + def personal_details(self): ''' - Personal details: the researcher's name, credit (published) name, and biography + 
Personal details: the researcher's name, credit (published) + name, and biography return : ''' - return self.__read_section("personal-details") - + return self.__read_section("personal-details") + def researcher_urls(self): ''' - Links to the researcher‚s personal or profile pages + Links to the researcher's personal or profile pages return : ''' - return self.__read_section("researcher-urls") - + return self.__read_section("researcher-urls") + def activities(self): ''' - Summary of the activities section of the ORCID record, including through /works below. + Summary of the activities section of the ORCID record, + including through /works below. return : ''' - return self.__read_section("activities") - + return self.__read_section("activities") + def educations(self): ''' Education affiliations - return : a tuple containing the Education details and the whole info tree related to education from orcid + return : a tuple containing the Education details and the whole + info tree related to education from orcid ''' - data = self.__read_section("educations") + data = self.__read_section("educations") edu = self.__extract_details(data, "education") - return (edu,data) - + return (edu, data) + def employments(self): ''' Employment affiliations - return : a tuple containing the Employment details and the whole info tree related to employment from orcid + return : a tuple containing the Employment details and the whole + info tree related to employment from orcid ''' - data = self.__read_section("employments") + data = self.__read_section("employments") employments = self.__extract_details(data, "employment") - return (employments,data) - + return (employments, data) + def fundings(self): ''' Summary of funding activities - return : a tuple containing the Funding details and the whole info tree related to funding from orcid + return : a tuple containing the Funding details and the whole + info tree related to funding from orcid ''' funding_details = [] - data = 
self.__read_section("fundings") + data = self.__read_section("fundings") group = data.get('group', []) for funding_summary in group: funding_summaries = funding_summary.get('funding-summary', []) for fund_summary in funding_summaries: - title = self.__get_value_from_keys(fund_summary,["title","title","value"]) - fund_type = self.__get_value_from_keys(fund_summary,["type"]) - start_date = self.get_formatted_date(fund_summary.get('start-date', {})) - end_date = self.get_formatted_date(fund_summary.get('end-date', {})) - organization= self.__get_value_from_keys(fund_summary,["organization","name"]) - organization_address = self.__org_string_from_obj(self.__get_value_from_keys(fund_summary, ["organization", "address"])) - url = self.__get_value_from_keys(fund_summary,["url","value"]) + title = self.__get_value_from_keys( + fund_summary, ["title", "title", "value"]) + fund_type = self.__get_value_from_keys( + fund_summary, ["type"]) + start_date = self.get_formatted_date( + fund_summary.get('start-date', {})) + end_date = self.get_formatted_date( + fund_summary.get('end-date', {})) + organization = self.__get_value_from_keys( + fund_summary, ["organization", "name"]) + organization_address = self.__org_string_from_obj( + self.__get_value_from_keys( + fund_summary, ["organization", "address"])) + url = self.__get_value_from_keys( + fund_summary, ["url", "value"]) funding_detail = { 'title': title, @@ -289,21 +349,22 @@ def fundings(self): funding_details.append(funding_detail) - return (funding_details,data) - + return (funding_details, data) + def peer_reviews(self): ''' Summary of peer review activities return : ''' - return self.__read_section("peer-reviews") - + return self.__read_section("peer-reviews") + def works(self): ''' Summary of research works - return : a tuple containing the Work details and the whole info tree related to work from orcid + return : a tuple containing the Work details and the whole + info tree related to work from orcid ''' - data = 
self.__read_section("works") + data = self.__read_section("works") work_details = [] group = data.get('group', []) @@ -312,13 +373,21 @@ def works(self): work_summaries = work_summary.get('work-summary', []) for work_summary in work_summaries: - title = self.__get_value_from_keys(work_summary,["title","title","value"]) - work_type = self.__get_value_from_keys(work_summary,["type"]) - publication_date= self.get_formatted_date(work_summary.get('publication-date', {})) - journal_title = self.__get_value_from_keys(work_summary,["journal-title","value"]) - organization = self.__get_value_from_keys(work_summary,["organization","name"]) - organization_address = self.__org_string_from_obj(self.__get_value_from_keys(work_summary, ["organization", "address"])) - url = self.__get_value_from_keys(work_summary,["url","value"]) + title = self.__get_value_from_keys( + work_summary, ["title", "title", "value"]) + work_type = self.__get_value_from_keys( + work_summary, ["type"]) + publication_date = self.get_formatted_date( + work_summary.get('publication-date', {})) + journal_title = self.__get_value_from_keys( + work_summary, ["journal-title", "value"]) + organization = self.__get_value_from_keys( + work_summary, ["organization", "name"]) + organization_address = self.__org_string_from_obj( + self.__get_value_from_keys( + work_summary, ["organization", "address"])) + url = self.__get_value_from_keys( + work_summary, ["url", "value"]) work_detail = { 'title': title, @@ -332,84 +401,92 @@ def works(self): work_details.append(work_detail) - return (work_details,data) - - def research_resources (self): + return (work_details, data) + + def research_resources(self): ''' - Summary of research resources + Summary of research resources return : ''' - return self.__read_section("research-resources") - + return self.__read_section("research-resources") + def services(self): ''' - Summary of services - return : a tuple containing the Service details and the whole info tree related to 
service from orcid + Summary of services + return : a tuple containing the Service details and the whole + info tree related to service from orcid ''' - data = self.__read_section("services") + data = self.__read_section("services") services = self.__extract_details(data, "service") - return (services,data) - + return (services, data) + def qualifications(self): ''' - Summary of qualifications - return : a tuple containing the Qualification details and the whole info tree related to qualification from orcid + Summary of qualifications + return : a tuple containing the Qualification details and the + whole info tree related to qualification from orcid ''' - data = self.__read_section("qualifications") + data = self.__read_section("qualifications") qualifications = self.__extract_details(data, "qualification") - return (qualifications,data) - + return (qualifications, data) + def memberships(self): ''' - Summary of memberships - return : a tuple containing the Membership details and the whole info tree related to membership from orcid + Summary of memberships + return : a tuple containing the Membership details and the whole + info tree related to membership from orcid ''' - data = self.__read_section("memberships") + data = self.__read_section("memberships") mem = self.__extract_details(data, "membership") - return (mem,data) - + return (mem, data) + def distinctions(self): ''' - Summary of distinctions - return : a tuple containing the distinction details and the whole info tree related to distinction from orcid + Summary of distinctions + return : a tuple containing the distinction details and the whole + info tree related to distinction from orcid ''' - data = self.__read_section("distinctions") + data = self.__read_section("distinctions") distinctions = self.__extract_details(data, "distinction") - return (distinctions,data) - + return (distinctions, data) + def invited_positions(self): ''' Summary of invited positions - return : a tuple containing the invited 
position details and the whole info tree related to invited position from orcid + return : a tuple containing the invited position details and the + whole info tree related to invited position from orcid ''' - data = self.__read_section("invited-positions") + data = self.__read_section("invited-positions") invited_pos = self.__extract_details(data, "invited-position") - return (invited_pos,data) - - def get_formatted_date(self,date_dict): + return (invited_pos, data) + + def get_formatted_date(self, date_dict): """ - Formats a date dictionary into a string (e.g., "MM/YYYY") if all required keys are present and not None. + Formats a date dictionary into a string (e.g., "MM/YYYY") if all + required keys are present and not None. Args: - date_dict (dict): A dictionary containing 'year', 'month', and 'day' keys. + date_dict (dict): A dictionary containing 'year', 'month', + and 'day' keys. Returns: - str: The formatted date string or an empty string if any required key is missing or None. + str: The formatted date string or an empty string if any + required key is missing or None. """ if date_dict is not None: - year = self.__get_value_from_keys(date_dict,["year","value"]) - month = self.__get_value_from_keys(date_dict,["month","value"]) - day = self.__get_value_from_keys(date_dict,["day","value"]) + year = self.__get_value_from_keys(date_dict, ["year", "value"]) + month = self.__get_value_from_keys( + date_dict, ["month", "value"]) # Check if all required keys are present and not None if year is not None and month is not None: @@ -421,16 +498,18 @@ def get_formatted_date(self,date_dict): else: return '' - def __are_keys_accessible(self,json_obj, keys): + def __are_keys_accessible(self, json_obj, keys): """ - Check if all keys are accessible cumulatively in the JSON-like object. + Check if all keys are accessible cumulatively in the JSON-like + object. Args: json_obj (dict): The JSON-like object (dictionary). keys (list): List of keys to check for accessibility. 
Returns: - bool: True if all keys are accessible cumulatively, False otherwise. + bool: True if all keys are accessible cumulatively, + False otherwise. """ current_obj = json_obj @@ -444,14 +523,17 @@ def __are_keys_accessible(self,json_obj, keys): def __get_value_from_keys(self, json_obj, keys): """ - Get the value associated with the last key in the list if all keys are accessible cumulatively. + Get the value associated with the last key in the list if all keys + are accessible cumulatively. Args: json_obj (dict): The JSON-like object (dictionary). - keys (list): List of keys to check for accessibility and retrieve the final value. + keys (list): List of keys to check for accessibility and retrieve + the final value. Returns: - Any: The value associated with the last key if all keys are accessible cumulatively, or None if not accessible. + Any: The value associated with the last key if all keys are + accessible cumulatively, or None if not accessible. """ if self.__are_keys_accessible(json_obj, keys): current_obj = json_obj @@ -462,31 +544,39 @@ def __get_value_from_keys(self, json_obj, keys): return current_obj else: return None - + def __extract_details(self, data, key): ''' Helper function for record_summary() ''' details = [] - + # Extract the 'affiliation-group' from the data affiliation_group = data.get('affiliation-group', []) - + for group in affiliation_group: summaries = group.get('summaries', []) - + for summary in summaries: key_summary = summary.get(f'{key}-summary', {}) - department = self.__get_value_from_keys(key_summary,["department-name"]) - role = self.__get_value_from_keys(key_summary,["role-title"]) - start_date = self.get_formatted_date(key_summary.get('start-date', {})) - end_date = self.get_formatted_date(key_summary.get('end-date', {})) - organization = self.__get_value_from_keys(key_summary,["organization","name"]) - + department = self.__get_value_from_keys( + key_summary, ["department-name"]) + role = self.__get_value_from_keys( + 
key_summary, ["role-title"]) + start_date = self.get_formatted_date( + key_summary.get('start-date', {})) + end_date = self.get_formatted_date( + key_summary.get('end-date', {})) + organization = self.__get_value_from_keys( + key_summary, ["organization", "name"]) + # Extract the organization address components into a string - organization_address = self.__org_string_from_obj(self.__get_value_from_keys(key_summary, ["organization", "address"])) + organization_address = self.__org_string_from_obj( + self.__get_value_from_keys( + key_summary, ["organization", "address"])) - url = self.__get_value_from_keys(key_summary,["url","value"]) + url = self.__get_value_from_keys( + key_summary, ["url", "value"]) detail = { 'Department': department, 'Role': role, @@ -496,11 +586,11 @@ def __extract_details(self, data, key): 'organization-address': organization_address, 'url': url, } - + details.append(detail) - + return details - + def __org_string_from_obj(self, org_obj): ''' Helper function for record_summary() @@ -509,13 +599,13 @@ def __org_string_from_obj(self, org_obj): if not isinstance(org_obj, dict): return org_string - # Build a string from the organization components without unicode characters + # Build a string from organization components without unicode org_parts = filter(None, org_obj.values()) if org_parts is not None: - org_string = ', '.join([self.__deunicode_string(i) for i in org_parts]) - - return org_string + org_string = ', '.join([ + self.__deunicode_string(i) for i in org_parts]) + return org_string def record_summary(self): ''' @@ -523,52 +613,72 @@ def record_summary(self): return : a dictionary of summary view of the full ORCID record ''' data = self.record() + last_modified_value = self.__get_value_from_keys( + data, ["history", "last-modified-date", "value"]) extracted_data = { 'ORCiD ID': self._orcid_id, - 'Last Modified': self.__timestamp_to_iso_date(self.__get_value_from_keys(data,["history","last-modified-date","value"])), - 'Name': 
self.__get_value_from_keys(data,["person","name","given-names","value"]), - 'Family Name': self.__get_value_from_keys(data,["person","name","family-name","value"]), - 'Credit Name': self.__get_value_from_keys(data,["person","name","credit-name","value"]), - 'Other Names': [name['content'] for name in self.__get_value_from_keys(data,["person","other-names","other-name"])], - 'Biography': self.__get_value_from_keys(data,["person","biography","content"]), - 'Emails': [email['email'] for email in self.__get_value_from_keys(data,["person","emails","email"])], - 'Research Tags (keywords)': [keyword['content'] for keyword in self.__get_value_from_keys(data,["person","keywords","keyword"])], + 'Last Modified': self.__timestamp_to_iso_date( + last_modified_value), + 'Name': self.__get_value_from_keys( + data, ["person", "name", "given-names", "value"]), + 'Family Name': self.__get_value_from_keys( + data, ["person", "name", "family-name", "value"]), + 'Credit Name': self.__get_value_from_keys( + data, ["person", "name", "credit-name", "value"]), + 'Other Names': [ + name['content'] for name in self.__get_value_from_keys( + data, ["person", "other-names", "other-name"])], + 'Biography': self.__get_value_from_keys( + data, ["person", "biography", "content"]), + 'Emails': [ + email['email'] for email in self.__get_value_from_keys( + data, ["person", "emails", "email"])], + 'Research Tags (keywords)': [ + keyword['content'] for keyword in self.__get_value_from_keys( + data, ["person", "keywords", "keyword"])], } # Extract education details education_details = self.educations()[0] - if education_details: extracted_data['Education'] = education_details + if education_details: + extracted_data['Education'] = education_details # Extract education details qualification_details = self.qualifications()[0] - if qualification_details: extracted_data['Quaifications'] = qualification_details + if qualification_details: + extracted_data['Quaifications'] = qualification_details # Extract 
employment details employment_details = self.employments()[0] - if employment_details: extracted_data['Employment'] = employment_details + if employment_details: + extracted_data['Employment'] = employment_details # Extract education details distinction_details = self.distinctions()[0] - if distinction_details: extracted_data['Distinctions'] = distinction_details + if distinction_details: + extracted_data['Distinctions'] = distinction_details # Extract employment details - Invited_details = self.invited_positions()[0] - if Invited_details: extracted_data['Invited Positions'] = Invited_details + invited_details = self.invited_positions()[0] + if invited_details: + extracted_data['Invited Positions'] = invited_details # Extract education details membership_details = self.memberships()[0] - if membership_details: extracted_data['Memberships'] = membership_details + if membership_details: + extracted_data['Memberships'] = membership_details # Extract service details service_details = self.services()[0] - if service_details: extracted_data['Service'] = service_details + if service_details: + extracted_data['Service'] = service_details # Extract funding details with start and end dates extracted_data['Fundings'] = self.fundings()[0] extracted_data['Works'] = self.works()[0] return extracted_data - + def generate_markdown_file(self, output_file=None): ''' Generates a markdown file with the ORCID record summary @@ -580,7 +690,8 @@ def generate_markdown_file(self, output_file=None): if 'Name' in data: file_name = f"{data['Name']}.md" else: - file_name = "output.md" # Default file name if 'Name' field is missing + # Default file name if 'Name' field is missing + file_name = "output.md" if output_file is not None: file_name = output_file @@ -588,28 +699,31 @@ def generate_markdown_file(self, output_file=None): with open(file_name, 'w', encoding='utf-8') as md_file: for section, content in data.items(): md_file.write(f"## {section}\n\n") - + if isinstance(content, list): if 
content: if isinstance(content[0], dict): keys = content[0].keys() md_file.write("| " + " | ".join(keys) + " |\n") - md_file.write("| " + " | ".join(["---"] * len(keys)) + " |\n") + md_file.write( + "| " + " | ".join(["---"] * len(keys)) + + " |\n") for item in content: - md_file.write("| " + " | ".join(str(item[key]) for key in keys) + " |\n") + md_file.write( + "| " + " | ".join( + str(item[key]) for key in keys) + + " |\n") else: for item in content: - md_file.write("- " + f"{item}\n") + md_file.write(f"- {item}\n") else: md_file.write("No data available.\n") else: md_file.write(f"{content}\n") - - md_file.write("\n") - + md_file.write("\n") - ## THESE FUNCTIONS ARE FOR TESTING PURPOSES ## + # THESE FUNCTIONS ARE FOR TESTING PURPOSES def __test_is_access_token_valid(self): ''' FOR TESTING PURPOSES ONLY @@ -617,8 +731,10 @@ def __test_is_access_token_valid(self): ''' # Access the environment variable from github secrets access_token = os.environ["ORCID_ACCESS_TOKEN"] - if access_token=="": - raise ValueError("Empty value for access token! Please make sure you are authenticated by ORCID as developer.") + if access_token == "": + raise ValueError( + "Empty value for access token! 
Please make sure you are " + "authenticated by ORCID as developer.") # Make a test request to the API using the token headers = { 'Authorization': f'Bearer {access_token}', @@ -629,24 +745,27 @@ def __test_is_access_token_valid(self): if self._state == "public": # Specify the ORCID record endpoint for the desired ORCID iD - api_url = f'https://pub.sandbox.orcid.org/v3.0/{self._orcid_id}' - + api_url = (f'https://pub.sandbox.orcid.org/v3.0/' + f'{self._orcid_id}') + elif self._state == "member": - api_url = f'https://api.sandbox.orcid.org/v3.0/{self._orcid_id}' + api_url = (f'https://api.sandbox.orcid.org/v3.0/' + f'{self._orcid_id}') response = requests.get(api_url, headers=headers) if response.status_code == 404: # The request was successful, and the token is likely valid return False else: - # The request failed, indicating that the token may have expired or is invalid + # The request failed, token may have expired or is invalid return True - - def __test_read_section(self,section="record"): + + def __test_read_section(self, section="record"): ''' FOR TESTING PURPOSES ONLY Reads the section of a Orcid member Profile - return : a dictionary of summary view of the section of ORCID data + return : a dictionary of summary view of the section of + ORCID data ''' access_token = os.environ["ORCID_ACCESS_TOKEN"] @@ -661,10 +780,12 @@ def __test_read_section(self,section="record"): if self._state == "public": # Specify the ORCID record endpoint for the desired ORCID iD - api_url = f'https://pub.sandbox.orcid.org/v3.0/{self._orcid_id}/{section}' - + api_url = (f'https://pub.sandbox.orcid.org/v3.0/' + f'{self._orcid_id}/{section}') + elif self._state == "member": - api_url = f'https://api.sandbox.orcid.org/v3.0/{self._orcid_id}/{section}' + api_url = (f'https://api.sandbox.orcid.org/v3.0/' + f'{self._orcid_id}/{section}') # Make a GET request to retrieve the ORCID record response = requests.get(api_url, headers=headers) @@ -676,14 +797,14 @@ def 
__test_read_section(self,section="record"): return data else: # Handle the case where the request failed - print("Failed to retrieve ORCID data. Status code:", response.status_code) + print("Failed to retrieve ORCID data. Status code:", + response.status_code) return None def __test_record(self): ''' FOR TESTING PURPOSES ONLY Reads the Orcid record - return : a dictionary of summary view of the full ORCID record + return : a dictionary of summary view of the full ORCID record ''' return self.__test_read_section("record") - diff --git a/src/pyorcid/orcid_authentication.py b/src/pyorcid/orcid_authentication.py index 25954c3..813d522 100644 --- a/src/pyorcid/orcid_authentication.py +++ b/src/pyorcid/orcid_authentication.py @@ -1,34 +1,46 @@ -import requests +from __future__ import annotations + +import logging from urllib.parse import urlencode +import requests + +logger = logging.getLogger(__name__) + + class OrcidAuthentication: ''' - OrcidAuthentication is a class that handles the Orcid's OAuth 2.0 authorrization. - The Orcid's OAuth 2.0 authorrization is used to access the ORCID record of the user that gave access. - + OrcidAuthentication is a class that handles the Orcid's OAuth 2.0 + authorrization. + The Orcid's OAuth 2.0 authorrization is used to access the ORCID + record of the user that gave access. 
''' - def __init__(self, client_id, client_secret, redirect_uri="", sandbox=False): - ''' - initializes the ORCidAuthentication and gets the access token - Parameters - ---------- - client_id : str : client id obtained from the registered application - client_secret : str : client secret obtained from the registered application - redirect_uri : str : redirect uri obtained from the registered application - sandbox : bool : a boolean value to show if the ORCID sandbox API should be used (default: False) + def __init__( + self, + client_id: str, + client_secret: str, + redirect_uri: str = "", + sandbox: bool = False + ) -> None: + """Initialize ORCID Authentication handler. - ''' + Args: + client_id: Client ID from registered ORCID application + client_secret: Client secret from registered ORCID application + redirect_uri: Redirect URI from registered ORCID application + sandbox: Whether to use ORCID sandbox API for testing + """ self.__client_id = client_id self.__client_secret = client_secret self.__redirect_uri = redirect_uri self.__sandbox = sandbox - return None - - + self._session = requests.Session() + def get_private_access_token(self): ''' Send a request for Orcid's OAuth 2.0 authorrization - This method is used for Member API (read/update) and Public API's /read-limited scope + This method is used for Member API (read/update) and Public API's + /read-limited scope Requires user authorization ''' @@ -36,7 +48,7 @@ def get_private_access_token(self): auth_url_endpoint = "https://orcid.org/oauth/authorize" token_url = "https://orcid.org/oauth/token" - if(self.__sandbox): + if self.__sandbox: auth_url_endpoint = "https://sandbox.orcid.org/oauth/authorize" token_url = "https://sandbox.orcid.org/oauth/token" @@ -51,7 +63,9 @@ def get_private_access_token(self): print(f'Please go to this URL and authorize the app: {auth_url}') print("\n") # Step 2: Get the authorization code from the redirect URL - redirect_response = input('Paste the full URL of the page you were 
redirected to after authorizing: ') + redirect_response = input( + 'Paste the full URL of the page you were redirected to after ' + 'authorizing: ') code = redirect_response.split('code=')[1].split('&')[0] # Step 3: Exchange the authorization code for an access token @@ -67,17 +81,18 @@ def get_private_access_token(self): access_token = response.json().get('access_token') # set_key(".env", "ORCID_ACCESS_TOKEN", access_token) return access_token - + def get_public_access_token(self): """ - This method gets token for reading public data (/read-public scope) from Orcid. - Doesnt' require user authentication + This method gets token for reading public data (/read-public + scope) from Orcid. + Doesnt' require user authentication return: access token """ - scope='/read-public' + scope = '/read-public' token_url = "https://orcid.org/oauth/token" - - if(self.__sandbox): + + if self.__sandbox: token_url = "https://sandbox.orcid.org/oauth/token" params = { @@ -90,8 +105,8 @@ def get_public_access_token(self): try: response = requests.post(token_url, data=params, headers=headers) - # # Raises an exception for HTTP errors - response.raise_for_status() + # Raises an exception for HTTP errors + response.raise_for_status() access_token = response.json().get('access_token') return access_token @@ -99,19 +114,21 @@ def get_public_access_token(self): except requests.exceptions.RequestException as e: print(f"Error during token retrieval: {e}") return None - + def save_credentials(self, access_token): ''' Save the credentials and access token to a file ''' - print("Do you want to save credentials along with the access token? (y/n)") + print("Do you want to save credentials along with the access " + "token? 
(y/n)") choice = input().strip().lower() - + if choice == 'y': - print("The details will be saved in 'orcid_credentials.env' in the current working directory.") + print("The details will be saved in 'orcid_credentials.env' " + "in the current working directory.") print("Are you sure you want to continue? (y/n)") confirmation = input().strip().lower() - + if confirmation == 'y': # Save credentials and access token to a file with open('orcid_credentials.env', 'w') as file: @@ -123,8 +140,9 @@ def save_credentials(self, access_token): else: print("Credentials and access token not saved.") else: - print("This is the access token. Please retain this to access the ORCID record of the user that gave access along with their ORCID_ID.") + print("This is the access token. Please retain this to access " + "the ORCID record of the user that gave access along with " + "their ORCID_ID.") print(access_token) return None - diff --git a/src/pyorcid/orcid_scrapper.py b/src/pyorcid/orcid_scrapper.py index b6e9ea0..b1ee6af 100644 --- a/src/pyorcid/orcid_scrapper.py +++ b/src/pyorcid/orcid_scrapper.py @@ -1,54 +1,80 @@ -from .orcid import Orcid +from __future__ import annotations + +import logging +from typing import Any + import requests -import xmltojson -import json +import xmltodict + +from .orcid import Orcid + +logger = logging.getLogger(__name__) + class OrcidScrapper(Orcid): ''' - This is an alternative way to access public data on Orcid website through web-scraping + This is an alternative way to access public data on Orcid website + through web-scraping Inherited from Orcid class ''' - def __init__(self,orcid_id): - ''' - Initializes the OrcidScrapper class - ''' + def __init__(self, orcid_id: str) -> None: + """Initialize the OrcidScrapper class. 
+ + Args: + orcid_id: ORCID ID of the user + """ super().__init__(orcid_id) - return None - def __read_section(self, section="record"): ''' Reads the section of a Orcid member Profile - return : a dictionary of summary view of the section of ORCID data + return : a dictionary of summary view of the section of + ORCID data ''' - url = f"https://pub.orcid.org/v3.0/{self.orcid_id}/{section}" + url = f"https://pub.orcid.org/v3.0/{self._orcid_id}/{section}" data = self.__orcid_web_scrapper(url) - print("asfffffffffffffffffffffffffffffffffffffffffffffffffffffff") return data[section] - - def __orcid_web_scrapper(self,url): - ''' - Scrape the data from the url (orcid public webpage : pub.orcid.org) - ''' - xml_data = requests.get(url).content - #convert the xml tree scraped to a json tree - json_data = xmltojson.parse(xml_data) - #json string to json tree - data = json.loads(json_data) + def __orcid_web_scrapper(self, url: str) -> dict[str, Any]: + """Scrape data from the ORCID public webpage. 
+ + Args: + url: URL to scrape data from + + Returns: + Dictionary containing parsed data - #reformat the json tree + Raises: + requests.RequestException: If HTTP request fails + Exception: If XML parsing fails + """ + try: + response = self._session.get(url, timeout=30) + response.raise_for_status() + xml_data = response.content + + # Convert the XML tree scraped to a dict + data = xmltodict.parse(xml_data) + except requests.RequestException as e: + logger.error(f"Failed to fetch data from {url}: {e}") + raise + except Exception as e: + logger.error(f"Failed to parse XML data: {e}") + raise + + # reformat the json tree renamed_data = self.__rename_keys(data) result = self.__remove_metadata(renamed_data) return result - - def __rename_keys(self,data): + def __rename_keys(self, data): ''' - Reformats and renames the keys of a data dictionary acquired + Reformats and renames the keys of a data dictionary acquired thorugh scraping to match the names of keys accessed through API - return : a dictionary of summary view of the section of ORCID data''' + return : a dictionary of summary view of the section of + ORCID data + ''' if isinstance(data, dict): new_data = {} for key, value in data.items(): @@ -63,38 +89,53 @@ def __rename_keys(self,data): else: return data - def __remove_metadata(self,data): + def __remove_metadata(self, data): ''' - Removes unnecessary metadata from a data dictionary acquired + Removes unnecessary metadata from a data dictionary acquired ''' - result={} - #section name like record, works, activities-summary + result = {} + # section name like record, works, activities-summary section = list(data.keys())[0] keys_to_remove = list(data[section].keys())[0:30] - newdata = {key1: data[section][key1] for key1 in data[section] if key1 not in keys_to_remove} - result[section]=newdata + newdata = { + key1: data[section][key1] for key1 in data[section] + if key1 not in keys_to_remove + } + result[section] = newdata return result - - + def 
__extract_details(self, data, key): ''' - Helper function for __read_section to reading various sections for orcid profile + Helper function for __read_section to reading various sections + for orcid profile ''' details = [] # Extract the 'affiliation-group' from the data affiliation_group = data.get('affiliation-group', []) - + for summary in affiliation_group: - + key_summary = summary.get(f'{key}-summary', {}) - department = self.__get_value_from_keys(key_summary,["department-name"]) - role = self.__get_value_from_keys(key_summary,["role-title"]) - start_date = self.get_formatted_date(key_summary.get('start-date', {})) - end_date = self.get_formatted_date(key_summary.get('end-date', {})) - organization = self.__get_value_from_keys(key_summary,["organization","name"]) - organization_address = ', '.join(filter(None, self.__get_value_from_keys(key_summary, ["organization", "address"]).values())) if self.__get_value_from_keys(key_summary, ["organization", "address"]) is not None else '' - url = self.__get_value_from_keys(key_summary,["url","value"]) + department = self.__get_value_from_keys( + key_summary, ["department-name"]) + role = self.__get_value_from_keys( + key_summary, ["role-title"]) + start_date = self.get_formatted_date( + key_summary.get('start-date', {})) + end_date = self.get_formatted_date( + key_summary.get('end-date', {})) + organization = self.__get_value_from_keys( + key_summary, ["organization", "name"]) + org_addr_obj = self.__get_value_from_keys( + key_summary, ["organization", "address"]) + if org_addr_obj is not None: + organization_address = ', '.join( + filter(None, org_addr_obj.values())) + else: + organization_address = '' + url = self.__get_value_from_keys( + key_summary, ["url", "value"]) detail = { 'Department': department, 'Role': role, @@ -104,7 +145,7 @@ def __extract_details(self, data, key): 'organization-address': organization_address, 'url': url, } - + details.append(detail) - + return details diff --git 
a/src/pyorcid/orcid_search.py b/src/pyorcid/orcid_search.py index 37e8f45..ccfce64 100644 --- a/src/pyorcid/orcid_search.py +++ b/src/pyorcid/orcid_search.py @@ -1,68 +1,98 @@ -import requests +from __future__ import annotations + +import logging import os from urllib import parse -class OrcidSearch(): +import requests + +logger = logging.getLogger(__name__) + + +class OrcidSearch: ''' This is a wrapper class for ORCID Search API ''' - def __init__(self, orcid_access_token = " ", state = "public", sandbox=False) -> None: - ''' - Initialize orcid search instance - state : Whether to use public or member API of ORCID - orcid_access_token : Orcid access token obtained from the user with this orcid_id (default: "public") - sandbox : bool : a boolean value to show if the ORCID sandbox API should be used (default: False) - ''' + def __init__( + self, + orcid_access_token: str = " ", + state: str = "public", + sandbox: bool = False + ) -> None: + """Initialize ORCID search instance. + + Args: + orcid_access_token: ORCID access token + state: Whether to use "public" or "member" API of ORCID + sandbox: Whether to use ORCID sandbox API for testing + + Raises: + ValueError: If access token is invalid + """ self._orcid_access_token = orcid_access_token self._state = state self._sandbox = sandbox + self._session = requests.Session() + # For testing purposes (pytesting on github workflow) - if orcid_access_token != " ": + if orcid_access_token.strip() and orcid_access_token != " ": try: self.__is_access_token_valid() - except: + except (KeyError, OSError): if not self.__test_is_access_token_valid(): raise ValueError( - f"Invalid access token! Please make sure the provided credentials are correct.") - - return - - def search(self, query, start = 0, rows = 1000, search_mode = "expanded-search", columns = "orcid,given-names,family-name,current-institution-affiliation-name"): + "Invalid access token! Please make sure the " + "provided credentials are correct." 
+ ) + + def search( + self, + query, + start=0, + rows=1000, + search_mode="expanded-search", + columns="orcid,given-names,family-name," + "current-institution-affiliation-name" + ): ''' Search orcid records - for details on the query format see https://info.orcid.org/documentation/api-tutorials/api-tutorial-searching-the-orcid-registry/ + for details on the query format see + https://info.orcid.org/documentation/api-tutorials/ + api-tutorial-searching-the-orcid-registry/ query : the search query start : the offset for the paginated search, default = 0 rows : the number of rows to be returned, default = 1000 - search_mode : the search mode, either "expanded-search" (default), "search", or "csv-search" - columns : for the csv-search, default: "orcid,given-names,family-name,current-institution-affiliation-name" + search_mode : the search mode, either "expanded-search" (default), + "search", or "csv-search" + columns : for the csv-search, default: + "orcid,given-names,family-name, + current-institution-affiliation-name" return : a dictionary of search results ''' access_token = self._orcid_access_token - _search_mode = "expanded-search" if search_mode == "search" or search_mode == "csv-search": _search_mode = search_mode - _columns = columns query_encoded = parse.quote_plus(query) api_url = "" if self._state == "public": # Specify the ORCID record endpoint for the desired ORCID iD - api_url = f'https://pub.orcid.org/' - if (self._sandbox): - api_url = f'https://pub.sandbox.orcid.org/' # for testing + api_url = 'https://pub.orcid.org/' + if self._sandbox: + api_url = 'https://pub.sandbox.orcid.org/' # for testing elif self._state == "member": - api_url = f'https://api.orcid.org/' - if (self._sandbox): - api_url = f'https://api.sandbox.orcid.org/' # for testing + api_url = 'https://api.orcid.org/' + if self._sandbox: + api_url = 'https://api.sandbox.orcid.org/' # for testing - api_url = api_url + f'v3.0/{_search_mode}/?q={query_encoded}&start={start}&rows={rows}' + 
api_url = (f'{api_url}v3.0/{_search_mode}/?q={query_encoded}' + f'&start={start}&rows={rows}') content_type = 'application/json' if search_mode == "csv-search": @@ -72,11 +102,9 @@ def search(self, query, start = 0, rows = 1000, search_mode = "expanded-search", # Set the headers with the access token for authentication headers = { 'Authorization': f'Bearer {access_token}', - 'Content-Type': f'{content_type}' + 'Content-Type': content_type } - #print(api_url) - # Make a GET request to retrieve the ORCID record response = requests.get(api_url, headers=headers) @@ -87,8 +115,10 @@ def search(self, query, start = 0, rows = 1000, search_mode = "expanded-search", return data else: # Handle the case where the request failed - print("Failed to retrieve ORCID search results. Status code:", response.status_code) + print("Failed to retrieve ORCID search results. Status code:", + response.status_code) return None + def __is_access_token_valid(self): ''' Checks if the current access token is valid @@ -97,7 +127,8 @@ def __is_access_token_valid(self): if access_token == "": raise ValueError( - "Empty value for access token! Please make sure you are authenticated by ORCID as developer.") + "Empty value for access token! 
Please make sure you are " + "authenticated by ORCID as developer.") # Make a test request to the API using the token headers = { 'Authorization': f'Bearer {access_token}', @@ -108,14 +139,14 @@ def __is_access_token_valid(self): if self._state == "public": # Specify the ORCID record endpoint for the desired ORCID iD - api_url = f'https://pub.orcid.org/v3.0/search' - if (self._sandbox): - api_url = f'https://pub.sandbox.orcid.org/v3.0/search' # for testing + api_url = 'https://pub.orcid.org/v3.0/search' + if self._sandbox: + api_url = 'https://pub.sandbox.orcid.org/v3.0/search' elif self._state == "member": - api_url = f'https://api.orcid.org/v3.0/search' - if (self._sandbox): - api_url = f'https://api.sandbox.orcid.org/v3.0/search' # for testing + api_url = 'https://api.orcid.org/v3.0/search' + if self._sandbox: + api_url = 'https://api.sandbox.orcid.org/v3.0/search' response = requests.get(api_url, headers=headers) @@ -123,10 +154,10 @@ def __is_access_token_valid(self): # The request was successful, and the token is likely valid return False else: - # The request failed, indicating that the token may have expired or is invalid + # Token may have expired or is invalid return True - ## THESE FUNCTIONS ARE FOR TESTING PURPOSES ## + # THESE FUNCTIONS ARE FOR TESTING PURPOSES def __test_is_access_token_valid(self): ''' @@ -137,7 +168,8 @@ def __test_is_access_token_valid(self): access_token = os.environ["ORCID_ACCESS_TOKEN"] if access_token == "": raise ValueError( - "Empty value for access token! Please make sure you are authenticated by ORCID as developer.") + "Empty value for access token! 
Please make sure you are " + "authenticated by ORCID as developer.") # Make a test request to the API using the token headers = { 'Authorization': f'Bearer {access_token}', @@ -148,15 +180,15 @@ def __test_is_access_token_valid(self): if self._state == "public": # Specify the ORCID record endpoint for the desired ORCID iD - api_url = f'https://pub.sandbox.orcid.org/v3.0/search' + api_url = 'https://pub.sandbox.orcid.org/v3.0/search' elif self._state == "member": - api_url = f'https://api.sandbox.orcid.org/v3.0/search' + api_url = 'https://api.sandbox.orcid.org/v3.0/search' response = requests.get(api_url, headers=headers) if response.status_code == 404: # The request was successful, and the token is likely valid return False else: - # The request failed, indicating that the token may have expired or is invalid - return True \ No newline at end of file + # Token may have expired or is invalid + return True