54 changes: 54 additions & 0 deletions bindings/python/py_src/tokenizers/__init__.pyi
@@ -1614,6 +1614,60 @@ class Tokenizer:
:class:`~tokenizers.Encoding`: The final post-processed encoding
"""
pass
def post_process_tokens(
self,
/,
tokens: list[str],
pair: list[str] | None = None,
add_special_tokens: bool = True,
) -> list[str]:
"""
Post-process a list of tokens (and optionally a pair) and return the processed tokens.

This is a simplified interface that only handles the token strings, without the full
Encoding information. Useful for step-by-step tokenization.

Args:
tokens (:obj:`List[str]`):
The main sequence of tokens

pair (:obj:`List[str]`, `optional`):
An optional pair sequence of tokens

add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to add special tokens

Returns:
:obj:`List[str]`: A list of tokens with special tokens added according to the post-processor
"""
...
def post_process_ids(
self,
/,
ids: list[int],
pair: list[int] | None = None,
add_special_tokens: bool = True,
) -> list[int]:
"""
Post-process a list of token IDs (and optionally a pair) and return the processed IDs.

This is a simplified interface that only handles the token IDs, without the full
Encoding information. Useful for step-by-step tokenization.

Args:
ids (:obj:`List[int]`):
The main sequence of token IDs

pair (:obj:`List[int]`, `optional`):
An optional pair sequence of token IDs

add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to add special tokens

Returns:
:obj:`List[int]`: A list of token IDs with special tokens added according to the post-processor
"""
...

@property
def post_processor(self):
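A minimal usage sketch of the two new Tokenizer-level methods, assuming a checkpoint whose post-processor adds BERT-style [CLS]/[SEP] markers (ids here are illustrative; the exact output depends on the post-processor actually configured):

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("bert-base-uncased")

# Single sequence: the post-processor wraps the tokens.
print(tokenizer.post_process_tokens(["hello", "world"]))
# expected along the lines of: ['[CLS]', 'hello', 'world', '[SEP]']

# A pair sequence is merged into the main one by the post-processor.
print(tokenizer.post_process_ids([7592, 2088], pair=[2003]))
# expected along the lines of: [101, 7592, 2088, 102, 2003, 102]

# add_special_tokens=False passes the input through unchanged.
print(tokenizer.post_process_tokens(["hello"], add_special_tokens=False))
# expected: ['hello']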
54 changes: 54 additions & 0 deletions bindings/python/py_src/tokenizers/processors/__init__.pyi
@@ -117,6 +117,60 @@ class BertProcessing(PostProcessor):
:class:`~tokenizers.Encoding`: The final encoding
"""
pass
def process_tokens(
self,
/,
tokens: list[str],
pair: list[str] | None = None,
add_special_tokens: bool = True,
) -> list[str]:
"""
Process a list of tokens (and optionally a pair) and return the processed tokens.

This is a simplified interface that only handles the token strings, without the full
Encoding information. Useful for step-by-step tokenization.

Args:
tokens (:obj:`List[str]`):
The main sequence of tokens

pair (:obj:`List[str]`, `optional`):
An optional pair sequence of tokens

add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to add special tokens

Returns:
:obj:`List[str]`: A list of tokens with special tokens added
"""
...
def process_ids(
self,
/,
ids: list[int],
pair: list[int] | None = None,
add_special_tokens: bool = True,
) -> list[int]:
"""
Process a list of token IDs (and optionally a pair) and return the processed IDs.

This is a simplified interface that only handles the token IDs, without the full
Encoding information. Useful for step-by-step tokenization.

Args:
ids (:obj:`List[int]`):
The main sequence of token IDs

pair (:obj:`List[int]`, `optional`):
An optional pair sequence of token IDs

add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
Whether to add special tokens

Returns:
:obj:`List[int]`: A list of token IDs with special tokens added
"""
...

@property
def sep(self):
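The processor-level methods can also be called directly, without going through a Tokenizer. A sketch using the existing TemplateProcessing processor (illustrative ids; any PostProcessor exposing the new methods would behave analogously):

from tokenizers.processors import TemplateProcessing

proc = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 101), ("[SEP]", 102)],
)

print(proc.process_tokens(["hello", "world"]))
# expected: ['[CLS]', 'hello', 'world', '[SEP]']

print(proc.process_ids([7592, 2088], pair=[2003]))
# expected: [101, 7592, 2088, 102, 2003, 102]

print(proc.process_ids([7592], add_special_tokens=False))
# expected: [7592]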
130 changes: 130 additions & 0 deletions bindings/python/src/processors.rs
@@ -94,6 +94,26 @@ impl PostProcessor for PyPostProcessor {
self.processor
.process_encodings(encodings, add_special_tokens)
}

fn process_tokens(
&self,
tokens: Vec<String>,
pair_tokens: Option<Vec<String>>,
add_special_tokens: bool,
) -> tk::Result<Vec<String>> {
self.processor
.process_tokens(tokens, pair_tokens, add_special_tokens)
}

fn process_ids(
&self,
ids: Vec<u32>,
pair_ids: Option<Vec<u32>>,
add_special_tokens: bool,
) -> tk::Result<Vec<u32>> {
self.processor
.process_ids(ids, pair_ids, add_special_tokens)
}
}

#[pymethods]
@@ -165,6 +185,66 @@ impl PyPostProcessor {
Ok(final_encoding.into())
}

/// Process a list of tokens (and optionally a pair) and return the processed tokens.
///
/// This is a simplified interface that only handles the token strings, without the full
/// Encoding information. Useful for step-by-step tokenization.
///
/// Args:
/// tokens (:obj:`List[str]`):
/// The main sequence of tokens
///
/// pair (:obj:`List[str]`, `optional`):
/// An optional pair sequence of tokens
///
/// add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to add special tokens
///
/// Returns:
/// :obj:`List[str]`: A list of tokens with special tokens added
#[pyo3(signature = (tokens, pair = None, add_special_tokens = true))]
#[pyo3(text_signature = "(self, tokens, pair=None, add_special_tokens=True)")]
fn process_tokens(
&self,
tokens: Vec<String>,
pair: Option<Vec<String>>,
add_special_tokens: bool,
) -> PyResult<Vec<String>> {
ToPyResult(
self.processor
.process_tokens(tokens, pair, add_special_tokens),
)
.into()
}

/// Process a list of token IDs (and optionally a pair) and return the processed IDs.
///
/// This is a simplified interface that only handles the token IDs, without the full
/// Encoding information. Useful for step-by-step tokenization.
///
/// Args:
/// ids (:obj:`List[int]`):
/// The main sequence of token IDs
///
/// pair (:obj:`List[int]`, `optional`):
/// An optional pair sequence of token IDs
///
/// add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to add special tokens
///
/// Returns:
/// :obj:`List[int]`: A list of token IDs with special tokens added
#[pyo3(signature = (ids, pair = None, add_special_tokens = true))]
#[pyo3(text_signature = "(self, ids, pair=None, add_special_tokens=True)")]
fn process_ids(
&self,
ids: Vec<u32>,
pair: Option<Vec<u32>>,
add_special_tokens: bool,
) -> PyResult<Vec<u32>> {
ToPyResult(self.processor.process_ids(ids, pair, add_special_tokens)).into()
}

fn __repr__(&self) -> PyResult<String> {
crate::utils::serde_pyo3::repr(self)
.map_err(|e| exceptions::PyException::new_err(e.to_string()))
@@ -258,6 +338,56 @@ impl PostProcessor for PyPostProcessorTypeWrapper {
},
}
}

fn process_tokens(
&self,
mut tokens: Vec<String>,
mut pair_tokens: Option<Vec<String>>,
add_special_tokens: bool,
) -> tk::Result<Vec<String>> {
match self {
PyPostProcessorTypeWrapper::Single(inner) => inner
.read()
.map_err(|_| PyException::new_err("RwLock synchronisation primitive is poisoned, cannot get subtype of PyPostProcessor"))?
.process_tokens(tokens, pair_tokens, add_special_tokens),
PyPostProcessorTypeWrapper::Sequence(inner) => {
for processor in inner.iter() {
let result = processor
.read()
.map_err(|_| PyException::new_err("RwLock synchronisation primitive is poisoned, cannot get subtype of PyPostProcessor"))?
.process_tokens(tokens, pair_tokens, add_special_tokens)?;
tokens = result;
pair_tokens = None;
}
Ok(tokens)
},
}
}

fn process_ids(
&self,
mut ids: Vec<u32>,
mut pair_ids: Option<Vec<u32>>,
add_special_tokens: bool,
) -> tk::Result<Vec<u32>> {
match self {
PyPostProcessorTypeWrapper::Single(inner) => inner
.read()
.map_err(|_| PyException::new_err("RwLock synchronisation primitive is poisoned, cannot get subtype of PyPostProcessor"))?
.process_ids(ids, pair_ids, add_special_tokens),
PyPostProcessorTypeWrapper::Sequence(inner) => {
for processor in inner.iter() {
let result = processor
.read()
.map_err(|_| PyException::new_err("RwLock synchronisation primitive is poisoned, cannot get subtype of PyPostProcessor"))?
.process_ids(ids, pair_ids, add_special_tokens)?;
ids = result;
pair_ids = None;
}
Ok(ids)
},
}
}
}

impl<'de> Deserialize<'de> for PyPostProcessorTypeWrapper {
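The Sequence branches above give the pair argument a specific meaning: it is handed to the first processor only, which merges it into the main sequence, and every subsequent processor then sees a single, already-merged sequence (the pair is reset to None). A sketch of that behaviour, assuming tokenizers.processors.Sequence and illustrative templates:

from tokenizers.processors import Sequence, TemplateProcessing

inner = TemplateProcessing(
    single="$A [SEP]",
    pair="$A [SEP] $B [SEP]",
    special_tokens=[("[SEP]", 102)],
)
outer = TemplateProcessing(
    single="[CLS] $A",
    pair="[CLS] $A $B",
    special_tokens=[("[CLS]", 101)],
)

seq = Sequence([inner, outer])
# inner consumes the pair; outer only prepends [CLS] to the merged result.
print(seq.process_ids([7592], pair=[2003]))
# expected: [101, 7592, 102, 2003, 102]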
64 changes: 64 additions & 0 deletions bindings/python/src/tokenizer.rs
@@ -1735,6 +1735,70 @@ impl PyTokenizer {
.into()
}

/// Post-process a list of tokens (and optionally a pair) and return the processed tokens.
///
/// This is a simplified interface that only handles the token strings, without the full
/// Encoding information. Useful for step-by-step tokenization.
///
/// Args:
/// tokens (:obj:`List[str]`):
/// The main sequence of tokens
///
/// pair (:obj:`List[str]`, `optional`):
/// An optional pair sequence of tokens
///
/// add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to add special tokens
///
/// Returns:
/// :obj:`List[str]`: A list of tokens with special tokens added according to the post-processor
#[pyo3(signature = (tokens, pair=None, add_special_tokens=true))]
#[pyo3(text_signature = "(self, tokens, pair=None, add_special_tokens=True)")]
fn post_process_tokens(
&self,
tokens: Vec<String>,
pair: Option<Vec<String>>,
add_special_tokens: bool,
) -> PyResult<Vec<String>> {
ToPyResult(
self.tokenizer
.post_process_tokens(tokens, pair, add_special_tokens),
)
.into()
}

/// Post-process a list of token IDs (and optionally a pair) and return the processed IDs.
///
/// This is a simplified interface that only handles the token IDs, without the full
/// Encoding information. Useful for step-by-step tokenization.
///
/// Args:
/// ids (:obj:`List[int]`):
/// The main sequence of token IDs
///
/// pair (:obj:`List[int]`, `optional`):
/// An optional pair sequence of token IDs
///
/// add_special_tokens (:obj:`bool`, defaults to :obj:`True`):
/// Whether to add special tokens
///
/// Returns:
/// :obj:`List[int]`: A list of token IDs with special tokens added according to the post-processor
#[pyo3(signature = (ids, pair=None, add_special_tokens=true))]
#[pyo3(text_signature = "(self, ids, pair=None, add_special_tokens=True)")]
fn post_process_ids(
&self,
ids: Vec<u32>,
pair: Option<Vec<u32>>,
add_special_tokens: bool,
) -> PyResult<Vec<u32>> {
ToPyResult(
self.tokenizer
.post_process_ids(ids, pair, add_special_tokens),
)
.into()
}

/// The :class:`~tokenizers.models.Model` in use by the Tokenizer
#[getter]
fn get_model(&self, py: Python<'_>) -> PyResult<Py<PyAny>> {
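Together these hooks support the step-by-step tokenization the docstrings mention. A sketch that decomposes encode() into a model pass plus an explicit post-processing pass, assuming a BERT-style checkpoint:

from tokenizers import Tokenizer

tok = Tokenizer.from_pretrained("bert-base-uncased")
text = "hello world"

# 1. Run the pipeline without post-processing.
raw = tok.encode(text, add_special_tokens=False)

# 2. Apply the post-processor to tokens and ids separately.
tokens = tok.post_process_tokens(raw.tokens)
ids = tok.post_process_ids(raw.ids)

# 3. The result should match the one-shot path.
full = tok.encode(text)
assert tokens == full.tokens
assert ids == full.ids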