Add automated type checking #538
base: main
Changes from all commits
026be6f
f7296e5
134cc01
816c355
1392026
deb657c
@@ -0,0 +1,6 @@
version: str
__version__: str
version_tuple: tuple[int | str, ...]
__version_tuple__: tuple[int | str, ...]
commit_id: str | None
__commit_id__: str | None
Comment on lines +1 to +6

Collaborator (Author): This covers the _version.py generated by setuptools_scm at installation time. That doesn't exist in the repository, so CI complains it can't resolve typing from the utils method. You can find a similar setup in Black to handle this.
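For context, here is a minimal sketch of the consuming side, assuming the generated module lives at adlfs/_version.py and is imported roughly the way Black reads its generated version module; the exact adlfs code may differ.

```python
# Hypothetical sketch -- not the adlfs source. The generated adlfs/_version.py
# only exists in an installed package, so the checked-in .pyi stub above is
# what lets the type checker resolve these names from a plain source checkout in CI.
from adlfs._version import __version__


def user_agent() -> str:
    # e.g. embedded in the user agent string passed to the SDK client
    return f"adlfs/{__version__}"
```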
@@ -9,13 +9,13 @@
import logging
import os
import re
import typing
import warnings
import weakref
from collections import defaultdict
from collections.abc import Iterable
from datetime import datetime, timedelta, timezone
from glob import has_magic
from typing import Optional, Tuple
from typing import Any, Literal
from uuid import uuid4

from azure.core.exceptions import (

@@ -105,7 +105,7 @@ def get_running_loop():
    return loop


def _coalesce_version_id(*args) -> Optional[str]:
def _coalesce_version_id(*args) -> str | None:
    """Helper to coalesce a list of version_ids down to one"""
    version_ids = set(args)
    if None in version_ids:

@@ -123,10 +123,10 @@ def _coalesce_version_id(*args) -> Optional[str]:


def _create_aio_blob_service_client(
    account_url: str,
    location_mode: Optional[str] = None,
    credential: Optional[str] = None,
    location_mode: str | None = None,
    credential: str | None = None,
) -> AIOBlobServiceClient:
    service_client_kwargs = {
    service_client_kwargs: dict[str, Any] = {
        "account_url": account_url,
        "user_agent": _USER_AGENT,
    }
Comment on lines +129 to 132

Collaborator (Author): This needs to be explicitly typed because values are mutated later to more than the inferred type of ….
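A minimal sketch of why the annotation is needed (placeholder values, not the adlfs code): without it, the checker infers dict[str, str] from the literal and then rejects the later mutations.

```python
from typing import Any

# With no annotation the inferred type would be dict[str, str], so the
# assignments below would be flagged once non-string values are added.
service_client_kwargs: dict[str, Any] = {
    "account_url": "https://example.blob.core.windows.net",  # placeholder URL
    "user_agent": "adlfs/example",
}
service_client_kwargs["connection_timeout"] = 20  # int value
service_client_kwargs["credential"] = None  # non-str value
```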
@@ -264,30 +264,30 @@ class AzureBlobFileSystem(AsyncFileSystem):

    def __init__(
        self,
        account_name: str = None,
        account_key: str = None,
        connection_string: str = None,
        credential: str = None,
        sas_token: str = None,
        account_name: str | None = None,
        account_key: str | None = None,
        connection_string: str | None = None,
        credential: str | None = None,
        sas_token: str | None = None,
        request_session=None,
        socket_timeout=_SOCKET_TIMEOUT_DEFAULT,
        blocksize: int = _DEFAULT_BLOCK_SIZE,
        client_id: str = None,
        client_secret: str = None,
        tenant_id: str = None,
        anon: bool = None,
        client_id: str | None = None,
        client_secret: str | None = None,
        tenant_id: str | None = None,
        anon: bool | None = None,
        location_mode: str = "primary",
        loop=None,
        asynchronous: bool = False,
        default_fill_cache: bool = True,
        default_cache_type: str = "bytes",
        version_aware: bool = False,
        assume_container_exists: Optional[bool] = None,
        max_concurrency: Optional[int] = None,
        timeout: Optional[int] = None,
        connection_timeout: Optional[int] = None,
        read_timeout: Optional[int] = None,
        account_host: str = None,
        assume_container_exists: bool | None = None,
        max_concurrency: int | None = None,
        timeout: int | None = None,
        connection_timeout: int | None = None,
        read_timeout: int | None = None,
        account_host: str | None = None,
        **kwargs,
    ):
        self.kwargs = kwargs.copy()

@@ -386,13 +386,15 @@ def __init__(
        weakref.finalize(self, sync, self.loop, close_credential, self)

        if max_concurrency is None:
            batch_size = _get_batch_size()
            batch_size: int = _get_batch_size()  # type: ignore[assignment]
Collaborator (Author): We know this is a number in all cases (we do a numerical comparison on the next line), but …. We could add special handling for None if we want; I just wasn't able to find a way to actually trigger it for adlfs.
            if batch_size > 0:
                max_concurrency = batch_size
            else:
                max_concurrency = 1
Comment on lines +392 to +393

Collaborator (Author): This is resolving a configuration edge case. The change moves us to single-request throughput (also the SDK default) if we can't determine a batch_size from the OS. That is what would have been intended by ….
Collaborator: I'm a little hesitant to change this fallback value as part of supporting type checking, especially if type checking can pass without this change. Mainly, I would want to understand the intention of a return value of zero or negative from ….
Collaborator (Author): I have a PR open with the Azure SDK to hopefully help fix this issue. The APIs are inconsistently typed: some allow None, others don't, and a different subset actually works with None (this is one of them). Once we get that aligned, we should be fine to remove this piece.
Collaborator (Author): We got 3 PRs merged upstream. Once the next Python SDK release goes out, we should be able to remove this.
        self.max_concurrency = max_concurrency
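For reference, a standalone sketch of the fallback under discussion, with the optional None handling mentioned above folded in; this is an illustration, not the code in this PR, and it assumes the OS-derived batch size may be missing or unusable.

```python
def resolve_max_concurrency(batch_size: int | None) -> int:
    """Map an OS-derived batch size onto a usable concurrency setting."""
    if batch_size is None or batch_size <= 0:
        # Fall back to single-request throughput (the SDK default).
        return 1
    return batch_size


assert resolve_max_concurrency(None) == 1
assert resolve_max_concurrency(0) == 1
assert resolve_max_concurrency(8) == 8
```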
    @classmethod
    def _strip_protocol(cls, path: str):
    def _strip_protocol(cls, path: str) -> str:
        """
        Remove the protocol from the input path

@@ -407,7 +409,7 @@ def _strip_protocol(cls, path: str):
        Returns a path without the protocol
        """
        if isinstance(path, list):
            return [cls._strip_protocol(p) for p in path]
            return [cls._strip_protocol(p) for p in path]  # type: ignore[return-value]
Collaborator (Author): We already explicitly require …. We can introduce a change that records a return type of ….
        STORE_SUFFIX = ".dfs.core.windows.net"
        logger.debug(f"_strip_protocol for {path}")
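One way the list case could be recorded without the ignore is typing.overload; the simplified class below is only an illustration of that option (the normalization logic is a stand-in), not a change made in this PR.

```python
from typing import overload


class ExampleFileSystem:
    @overload
    @classmethod
    def _strip_protocol(cls, path: str) -> str: ...
    @overload
    @classmethod
    def _strip_protocol(cls, path: list[str]) -> list[str]: ...

    @classmethod
    def _strip_protocol(cls, path: str | list[str]) -> str | list[str]:
        if isinstance(path, list):
            return [cls._strip_protocol(p) for p in path]
        return path.removeprefix("abfs://")  # stand-in for the real normalization
```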
@@ -473,6 +475,16 @@ def _get_credential_from_service_principal(self): | |
| ------- | ||
| Tuple of (Async Credential, Sync Credential). | ||
| """ | ||
| if ( | ||
| self.tenant_id is None | ||
| or self.client_id is None | ||
| or self.client_secret is None | ||
| ): | ||
| raise ValueError( | ||
| "tenant_id, client_id, and client_secret must all be provided " | ||
| "when authenticating with a service principal." | ||
| ) | ||
|
Comment on lines +478 to +486

Collaborator (Author): This is an existing failure mode. You can pass just ….
        from azure.identity import ClientSecretCredential
        from azure.identity.aio import (
            ClientSecretCredential as AIOClientSecretCredential,

@@ -573,7 +585,7 @@ def do_connect(self):

    def split_path(
        self, path, delimiter="/", return_container: bool = False, **kwargs
    ) -> Tuple[str, str, Optional[str]]:
    ) -> tuple[str, str, str | None]:
        """
        Normalize ABFS path string into bucket and key.

@@ -708,7 +720,7 @@ async def _ls_blobs(
        path: str,
        delimiter: str = "/",
        return_glob: bool = False,
        version_id: Optional[str] = None,
        version_id: str | None = None,
        versions: bool = False,
        **kwargs,
    ):

@@ -799,7 +811,7 @@ async def _ls(
        invalidate_cache: bool = False,
        delimiter: str = "/",
        return_glob: bool = False,
        version_id: Optional[str] = None,
        version_id: str | None = None,
        versions: bool = False,
        **kwargs,
    ):

@@ -867,7 +879,7 @@ async def _details(
        delimiter="/",
        return_glob: bool = False,
        target_path="",
        version_id: Optional[str] = None,
        version_id: str | None = None,
        versions: bool = False,
        **kwargs,
    ):

@@ -1195,9 +1207,9 @@ def makedir(self, path, exist_ok=False):

    async def _rm(
        self,
        path: typing.Union[str, typing.List[str]],
        path: str | list[str],
        recursive: bool = False,
        maxdepth: typing.Optional[int] = None,
        maxdepth: int | None = None,
        delimiter: str = "/",
        expand_path: bool = True,
        **kwargs,

@@ -1256,9 +1268,7 @@ async def _rm(

    rm = sync_wrapper(_rm)

    async def _rm_files(
        self, container_name: str, file_paths: typing.Iterable[str], **kwargs
    ):
    async def _rm_files(self, container_name: str, file_paths: Iterable[str], **kwargs):
        """
        Delete the given file(s)

@@ -1322,8 +1332,8 @@ async def _rm_file(self, path: str, **kwargs):
        self.invalidate_cache(self._parent(path))

    async def _separate_directory_markers_for_non_empty_directories(
        self, file_paths: typing.Iterable[str]
    ) -> typing.Tuple[typing.List[str], typing.List[str]]:
        self, file_paths: Iterable[str]
    ) -> tuple[list[str], list[str]]:
        """
        Distinguish directory markers of non-empty directories from files and directory markers for empty directories.
        A directory marker is an empty blob who's name is the path of the directory.

@@ -1635,6 +1645,12 @@ async def _url(
        account_name = self.account_name
        account_key = self.account_key

        if account_name is None:
            raise ValueError(
                "account_name is required to generate a SAS URL. "
                "Provide account_name or include AccountName in the connection string."
            )
Comment on lines +1648 to +1652

Collaborator (Author): Same deal about downstream requirements. If account_name is None, the string concatenation blows up because you can't add None to a string. This will raise a clearer error.
Collaborator: More for my curiosity, were you able to raise this new exception purely through setting values in the initializer? I was only able to reach it by mutating the ….
        sas_token = generate_blob_sas(
            account_name=account_name,
            container_name=container_name,
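As a small illustration of the failure the guard replaces (placeholder values, not the adlfs code path itself):

```python
account_name = None  # e.g. no account_name given and no AccountName in the connection string

try:
    # Roughly what the SAS URL construction ends up doing downstream.
    url = "https://" + account_name + ".blob.core.windows.net"  # type: ignore[operator]
except TypeError as exc:
    # Opaque failure: can only concatenate str (not "NoneType") to str.
    # The new check raises a ValueError naming the missing setting instead.
    print(exc)
```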
@@ -1653,8 +1669,10 @@ async def _url( | |
| url = f"{bc.url}?{sas_token}" | ||
| return url | ||
|
|
||
| def expand_path(self, path, recursive=False, maxdepth=None, skip_noexist=True): | ||
| return sync( | ||
| def expand_path( | ||
| self, path, recursive=False, maxdepth=None, skip_noexist=True | ||
| ) -> list[str]: | ||
| return sync( # type: ignore[return-value] | ||
|
Collaborator (Author): … For the time being, we know this is the same return type, so it's ignored.
            self.loop, self._expand_path, path, recursive, maxdepth, skip_noexist
        )
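An alternative that records the same assumption without a blanket ignore would be typing.cast; this is only a sketch of that option (not the PR's change), assuming fsspec's sync helper carries no useful return type.

```python
from typing import cast

from fsspec.asyn import sync


def expand_path(self, path, recursive=False, maxdepth=None, skip_noexist=True) -> list[str]:
    # Method-body sketch: cast() documents that the async implementation returns list[str].
    return cast(
        "list[str]",
        sync(self.loop, self._expand_path, path, recursive, maxdepth, skip_noexist),
    )
```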
@@ -1887,12 +1905,12 @@ def _open(
        self,
        path: str,
        mode: str = "rb",
        block_size: int = None,
        block_size: int | None = None,
        autocommit: bool = True,
        cache_options: dict = {},
        cache_type="readahead",
        metadata=None,
        version_id: Optional[str] = None,
        version_id: str | None = None,
        **kwargs,
    ):
        """Open a file on the datalake, or a block blob

@@ -1954,12 +1972,12 @@ def __init__(
        fs: AzureBlobFileSystem,
        path: str,
        mode: str = "rb",
        block_size="default",
        block_size: int | Literal["default"] | None = "default",
        autocommit: bool = True,
        cache_type: str = "bytes",
        cache_options: dict = {},
        metadata=None,
        version_id: Optional[str] = None,
        version_id: str | None = None,
        **kwargs,
    ):
        """

@@ -2017,9 +2035,10 @@ def __init__(

        self.loop = self._get_loop()
        self.container_client = self._get_container_client()
        self.blocksize = (
            self.DEFAULT_BLOCK_SIZE if block_size in ["default", None] else block_size
        )
        if block_size == "default" or block_size is None:
            self.blocksize: int = self.DEFAULT_BLOCK_SIZE
        else:
            self.blocksize = block_size
Comment on lines +2038 to +2041

Collaborator (Author): This is a limitation of pyright. The code doesn't have any notable performance changes; it just allows the type checker to reason. pyright isn't currently able to extrapolate the ….
        self.loc = 0
        self.autocommit = autocommit
        self.end = None
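Roughly, the narrowing difference looks like the standalone sketch below; it only illustrates the pattern described in the comment above, and pyright's handling of the membership form may vary by version.

```python
from typing import Literal

DEFAULT_BLOCK_SIZE = 5 * 2**20  # placeholder constant for the sketch


def pick_blocksize(block_size: int | Literal["default"] | None) -> int:
    # The conditional-expression form
    #     DEFAULT_BLOCK_SIZE if block_size in ["default", None] else block_size
    # leaves block_size un-narrowed in the else arm, so the int return type
    # can't be verified. Explicit comparisons narrow it:
    if block_size == "default" or block_size is None:
        return DEFAULT_BLOCK_SIZE
    return block_size  # narrowed to int here
```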
@@ -2127,9 +2146,9 @@ def connect_client(self): | |
| """ | ||
| try: | ||
| if hasattr(self.fs, "account_host"): | ||
| self.fs.account_url: str = f"https://{self.fs.account_host}" | ||
| self.fs.account_url = f"https://{self.fs.account_host}" | ||
|
Collaborator (Author): These aren't valid type definitions because they're modifying ….
            else:
                self.fs.account_url: str = (
                self.fs.account_url = (
                    f"https://{self.fs.account_name}.blob.core.windows.net"
                )
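A minimal sketch of the rule involved in the two assignments above (hypothetical classes; runtime behavior is unchanged either way): type checkers only accept a declared type on a bare name or a self attribute, so the annotation belongs on the owning class and the assignment stays plain.

```python
class Owner:
    account_url: str  # declare the attribute's type on the class that owns it


class Wrapper:
    def __init__(self, fs: Owner, host: str) -> None:
        self.fs = fs
        # self.fs.account_url: str = f"https://{host}"  # rejected: type declared on a non-self attribute
        self.fs.account_url = f"https://{host}"  # plain assignment type-checks fine
```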
@@ -2164,7 +2183,7 @@ def connect_client(self):
                f"Unable to fetch container_client with provided params for {e}!!"
            ) from e

    async def _async_fetch_range(self, start: int, end: int = None, **kwargs):
    async def _async_fetch_range(self, start: int, end: int | None = None, **kwargs):
        """
        Download a chunk of data specified by start and end

@@ -2221,7 +2240,7 @@ async def _stage_block(self, data, start, end, block_id, semaphore):
        async with self.container_client.get_blob_client(blob=self.blob) as bc:
            await bc.stage_block(
                block_id=block_id,
                data=data[start:end],
                data=data[start:end],  # type: ignore[arg-type]
Collaborator (Author): I talked with the Azure Python SDK folks and am going to open a PR about this. The current definition doesn't support memoryview or bytearray, which are both valid inputs. Once that's fixed there, this ignore is no longer needed.
                length=end - start,
            )
        return block_id
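For illustration, the kinds of buffers the call site can pass at runtime (placeholder data; the upstream annotation fix itself is out of scope here):

```python
pending = bytearray(b"block data waiting to be staged")

# A memoryview slice is a zero-copy window onto the pending write buffer;
# bytes, bytearray, and memoryview all work with the service at runtime.
chunk = memoryview(pending)[0:5]
assert bytes(chunk) == b"block"
```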
@@ -2301,7 +2320,7 @@ async def _async_upload_chunk(self, final: bool = False, **kwargs): | |
| await bc.upload_blob( | ||
| data=data, | ||
| length=length, | ||
| blob_type=BlobType.AppendBlob, | ||
| blob_type=BlobType.APPENDBLOB, | ||
|
Collaborator (Author): …
                metadata=self.metadata,
            )
        else:

@@ -2329,6 +2348,6 @@ def __getstate__(self):
        return state

    def __setstate__(self, state):
        self.__dict__.update(state)
        self.__dict__.update(state)  # type: ignore[reportAttributeAccessIssue]
        self.loop = self._get_loop()
        self.container_client = self._get_container_client()
Reviewer: Is there a particular reason this was not added to the pre-commit hook and the requirements/dev.txt file? Was it for allowing the check to continue when there is an error? It seems like that is where most of the code checkers are, and it would also mean anyone who uses the pre-commit hooks will be able to run the type checker as well.
Author: No, we can definitely add them there. This was originally written to get testing bootstrapped on a machine other than my local one, but we should make sure it's easily reproducible everywhere.

The continue-on-error setting was there so a typing issue alone wouldn't block PRs. Whether we want to keep that, I'm indifferent on. What I wanted to avoid was people adding ignore comments or making the types potentially worse to get around the type checker.