From 528830a62f4510710a2fcc09fe312b522fde7772 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Tue, 21 Apr 2026 14:03:12 +0200 Subject: [PATCH 1/4] :sparkles: add support for V2 crop and split operations --- .gitignore | 7 +- .../MultiReceiptsAutoExtractionExample.php | 2 +- src/Error/MindeeInputException.php | 10 + src/Extraction/ExtractedImage.php | 59 ++++-- src/Extraction/ExtractedPdf.php | 47 ++--- src/Extraction/ImageExtractor.php | 184 +++++++++++------- src/Extraction/PdfExtractor.php | 68 +++---- src/V1/Image/ImageExtractor.php | 12 ++ src/V2/FileOperations/Crop.php | 67 +++++++ src/V2/FileOperations/CropFiles.php | 58 ++++++ src/V2/FileOperations/Split.php | 73 +++++++ src/V2/FileOperations/SplitFiles.php | 57 ++++++ .../DependencyCheckerNoExtendedTestPdf.php | 6 +- tests/V1/Extraction/ImageExtractorTest.php | 8 +- tests/V2/FileOperations/CropFunctional.php | 104 ++++++++++ tests/V2/FileOperations/CropTest.php | 68 +++++++ tests/V2/FileOperations/SplitFunctional.php | 95 +++++++++ tests/V2/FileOperations/SplitTest.php | 55 ++++++ 18 files changed, 830 insertions(+), 150 deletions(-) create mode 100644 src/Error/MindeeInputException.php create mode 100644 src/V1/Image/ImageExtractor.php create mode 100644 src/V2/FileOperations/Crop.php create mode 100644 src/V2/FileOperations/CropFiles.php create mode 100644 src/V2/FileOperations/Split.php create mode 100644 src/V2/FileOperations/SplitFiles.php create mode 100644 tests/V2/FileOperations/CropFunctional.php create mode 100644 tests/V2/FileOperations/CropTest.php create mode 100644 tests/V2/FileOperations/SplitFunctional.php create mode 100644 tests/V2/FileOperations/SplitTest.php diff --git a/.gitignore b/.gitignore index 514fd563..818808e9 100644 --- a/.gitignore +++ b/.gitignore @@ -21,8 +21,6 @@ composer.lock .env.local _test.php .php-cs-fixer.cache -local_test/ -LocalTest.php .phpunit* *.log /build @@ -30,3 +28,8 @@ LocalTest.php .phpdoc docs/_build tests/LocalTestNotUnit.php + +# For local testing +/output/ +local_test/ +LocalTest.php diff --git a/examples/MultiReceiptsAutoExtractionExample.php b/examples/MultiReceiptsAutoExtractionExample.php index 3a921404..05f07282 100644 --- a/examples/MultiReceiptsAutoExtractionExample.php +++ b/examples/MultiReceiptsAutoExtractionExample.php @@ -1,7 +1,7 @@ image = $image; $this->filename = $filename; $this->saveFormat = $saveFormat; + $this->pageId = $pageIndex; + $this->elementId = $index; } /** * Writes the image to a file. * Uses the default image format and filename. * - * @param string $outputPath The output directory (must exist). + * @param string $outputPath The output directory (must exist). + * @param null|string $format The image format to use. Defaults to the save format if not provided. + * @param integer $quality Quality of the saved image. + * * @return void * @throws \ImagickException Throws if the image can't be processed. */ - public function writeToFile(string $outputPath): void + public function writeToFile(string $outputPath, ?string $format = null, int $quality = 100): void { $imagePath = $outputPath . DIRECTORY_SEPARATOR . $this->filename; - $format = $this->getEncodedImageFormat($this->saveFormat); + $format = $this->getEncodedImageFormat($format ?? $this->saveFormat); $this->image->setImageFormat($format); + $this->image->stripImage(); + $quality = min(100, max(0, $quality)); + if ('png' === $format) { + $finalQuality = round($quality * 0.09); + $this->image->setOption('png:compression-level', $finalQuality); + } elseif (in_array($format, ['jpg', 'jpeg'])) { + $this->image->setImageCompression(\Imagick::COMPRESSION_JPEG); + } + $this->image->setImageCompressionQuality($quality); $this->image->writeImage($imagePath); } /** * Returns the image in a format suitable for sending to a client for parsing. * - * @throws \ImagickException Throws if the image can't be processed. * @return BytesInput Bytes input for the image. + * + * @throws \ImagickException Throws if the image can't be processed. */ public function asInputSource(): BytesInput { $format = $this->getEncodedImageFormat($this->saveFormat); $this->image->setImageFormat($format); + return new BytesInput($this->image->getImageBlob(), $this->filename); } @@ -82,7 +105,7 @@ public function asInputSource(): BytesInput * Get the encoded image format. * * @param string $saveFormat Format to save the file as. - * @return string + * @return string Encoded image format. */ private function getEncodedImageFormat(string $saveFormat): string { diff --git a/src/Extraction/ExtractedPdf.php b/src/Extraction/ExtractedPdf.php index 1a34ee1b..bffca188 100644 --- a/src/Extraction/ExtractedPdf.php +++ b/src/Extraction/ExtractedPdf.php @@ -16,25 +16,22 @@ class ExtractedPdf { /** - * File object for an ExtractedPdf. - * - * @var string + * @var string name of the original file */ - protected string $pdfBytes; + public string $filename; /** - * Name of the original file. - * - * @var string + * File object for an ExtractedPdf. */ - protected string $filename; + protected string $pdfBytes; /** * Initializes a new instance of the ExtractedPdf class. * - * @param string $pdfBytes A binary string representation of the PDF. - * @param string $filename Name of the original file. - * @throws MindeeUnhandledException Throws if PDF operations aren't supported. + * @param string $pdfBytes a binary string representation of the PDF + * @param string $filename name of the original file + * + * @throws MindeeUnhandledException throws if PDF operations aren't supported */ public function __construct(string $pdfBytes, string $filename) { @@ -47,16 +44,18 @@ public function __construct(string $pdfBytes, string $filename) /** * Wrapper for pdf GetPageCount(). * - * @return integer The number of pages in the file. - * @throws MindeePDFException Throws if FPDI is unable to process the file. + * @return int the number of pages in the file + * + * @throws MindeePDFException throws if FPDI is unable to process the file */ public function getPageCount(): int { try { - $pdfHandle = new FPDI(); + $pdfHandle = new Fpdi(); $tempFilename = tempnam(sys_get_temp_dir(), 'extracted_pdf_'); file_put_contents($tempFilename, $this->pdfBytes); + return $pdfHandle->setSourceFile($tempFilename); } catch (PdfParserException $e) { throw new MindeePDFException( @@ -70,14 +69,18 @@ public function getPageCount(): int /** * Write the PDF to a file. * - * @param string $outputPath The output directory (must exist). - * @return void + * @param string $outputPath the output directory (must exist) */ public function writeToFile(string $outputPath): void { - $pdfPath = $outputPath . DIRECTORY_SEPARATOR . $this->filename; - if (basename($outputPath) !== '') { - $pdfPath = realpath($outputPath); + $pdfPath = $outputPath.DIRECTORY_SEPARATOR.$this->filename; + if ('' !== basename($outputPath)) { + if (!($pdfPath = realpath($outputPath))) { + $pdfPath = $outputPath; + } + } + if (!str_ends_with(strtolower($pdfPath), 'pdf')) { + $pdfPath .= '.pdf'; } file_put_contents($pdfPath, $this->pdfBytes); } @@ -85,7 +88,7 @@ public function writeToFile(string $outputPath): void /** * Return the file in a format suitable for sending to MindeeClient for parsing. * - * @return BytesInput Bytes input for the image. + * @return BytesInput bytes input for the image */ public function asInputSource(): BytesInput { @@ -93,7 +96,7 @@ public function asInputSource(): BytesInput } /** - * @return string The pdf bytes. + * @return string the pdf bytes */ public function getPdfBytes(): string { @@ -101,7 +104,7 @@ public function getPdfBytes(): string } /** - * @return string The name of the file. + * @return string the name of the file */ public function getFilename(): string { diff --git a/src/Extraction/ImageExtractor.php b/src/Extraction/ImageExtractor.php index 37dcab68..a1a39d23 100644 --- a/src/Extraction/ImageExtractor.php +++ b/src/Extraction/ImageExtractor.php @@ -6,9 +6,9 @@ use Mindee\Error\MindeeGeometryException; use Mindee\Error\MindeeImageException; use Mindee\Error\MindeePDFException; -use Mindee\Error\MindeeUnhandledException; use Mindee\Geometry\BBox; use Mindee\Geometry\BBoxUtils; +use Mindee\Geometry\Polygon; use Mindee\Input\LocalInputSource; use Mindee\Parsing\DependencyChecker; use Mindee\Parsing\Standard\BaseField; @@ -19,36 +19,30 @@ class ImageExtractor { /** - * Array of extracted page images. - * - * @var array + * @var \Imagick[] Array of extracted page images. */ - private array $pageImages = []; + protected array $pageImages = []; + /** - * Name of the file. - * - * @var string + * @var string Name of the file. */ - private string $filename; + protected string $filename; + /** - * Format to save the image as. - * - * @var string + * @var string Format to save the image as. */ - private string $saveFormat; + protected string $saveFormat; + /** - * Local input object used by the ImageExtractor. - * - * @var LocalInputSource + * @var LocalInputSource Local input object used by the ImageExtractor. */ protected LocalInputSource $inputSource; - /** - * @param LocalInputSource $localInput Local Input, accepts all compatible formats. - * @param string|null $saveFormat Save format, will be coerced to jpg by default. - * @throws MindeeUnhandledException|MindeePDFException Throws if PDF operations aren't supported, - * or if the file can't be read, respectively. + * @param LocalInputSource $localInput Local input, accepts all compatible formats. + * @param null|string $saveFormat Save format, will be coerced to jpg by default. + * + * @throws MindeePDFException Throws if PDF operations aren't supported, or if the file can't be read, respectively. */ public function __construct(LocalInputSource $localInput, ?string $saveFormat = null) { @@ -58,8 +52,8 @@ public function __construct(LocalInputSource $localInput, ?string $saveFormat = $this->inputSource = $localInput; $extension = pathinfo($localInput->fileName, PATHINFO_EXTENSION); - if ($saveFormat === null) { - if ($extension && strtolower($extension) !== 'pdf') { + if (null === $saveFormat) { + if ($extension && 'pdf' !== strtolower($extension)) { $this->saveFormat = $extension; } else { $this->saveFormat = 'jpg'; @@ -89,7 +83,9 @@ public function __construct(LocalInputSource $localInput, ?string $saveFormat = * Renders the input PDF's pages as individual images. * * @param string $fileBytes Input pdf. - * @return array A list of pages. + * + * @return \Imagick[] A list of pages. + * * @throws MindeeImageException Throws if the image can't be handled. */ public static function pdfToImages(string $fileBytes): array @@ -100,7 +96,7 @@ public static function pdfToImages(string $fileBytes): array $imagick->readImageBlob($fileBytes); foreach ($imagick as $page) { - $page->setImageFormat('png'); + $page->setImageFormat('jpg'); $images[] = $page; } @@ -116,8 +112,7 @@ public static function pdfToImages(string $fileBytes): array /** * Gets the number of pages in the file. - * - * @return integer + * @return integer Page count. */ public function getPageCount(): int { @@ -129,8 +124,9 @@ public function getPageCount(): int * * @param array $fields List of Fields to extract. * @param integer $pageIndex The page index to extract, begins at 0. - * @param string|null $outputName The base output filename, must have an image extension. - * @return array A list of extracted images. + * @param null|string $outputName The base output filename, must have an image extension. + * + * @return array a list of extracted images */ public function extractImagesFromPage(array $fields, int $pageIndex, ?string $outputName = null): array { @@ -141,29 +137,52 @@ public function extractImagesFromPage(array $fields, int $pageIndex, ?string $ou /** * Extracts images from a page. * - * @param array $fields List of Fields to extract. - * @param integer $pageIndex The page index to extract, begins at 0. - * @param string $outputName Name of the created file. - * @return array An array of created images. + * @param array $polygons List of polygons to extract. + * @param integer $pageIndex The page index to extract, begins at 0. + * @param null|string $format Save format for extracted images. Defaults to the original format. + * + * @return array an array of created images */ - private function extractFromPage(array $fields, int $pageIndex, string $outputName): array + public function extractPolygonsFromPage(array $polygons, int $pageIndex, ?string $format = null): array { - $splitName = $this->splitNameStrict($outputName); - $filename = sprintf("%s_page-%03d.%s", $splitName[0], $pageIndex + 1, $this->saveFormat); + $saveFormat = $format ?? $this->saveFormat; $extractedImages = []; - $i = 0; - foreach ($fields as $field) { - $extractedImage = $this->extractImage($field, $pageIndex, $i + 1, $filename); - if ($extractedImage !== null) { - $extractedImages[] = $extractedImage; - } - $i++; + foreach ($polygons as $i => $polygon) { + $extractedImages[] = $this->extractPolygonFromPage($polygon, $pageIndex, $i, null, $format); } return $extractedImages; } + /** + * Extracts a cropped portion from an image. + * + * @param Polygon $polygon Polygon to extract. + * @param integer $pageIndex Page index to extract from. + * @param integer $index Index to use for naming the extracted image. + * @param null|string $filename Output filename. + * @param null|string $format Output format. + * + * @return ExtractedImage Extracted image data. + * @throws \ImagickException Throws if the image can't be processed. + */ + public function extractPolygonFromPage( + Polygon $polygon, + int $pageIndex, + int $index, + ?string $filename = null, + ?string $format = null + ): ExtractedImage { + $bbox = BBoxUtils::generateBBoxFromPolygon($polygon); + $extractedImageData = $this->extractImageFromBbox($bbox, $pageIndex); + $filename ??= $this->filename; + $format ??= $this->saveFormat; + $filename ??= sprintf('%s.%s_page%d-%d.%s', $filename, $format, $pageIndex, $index, $format); + + return new ExtractedImage($extractedImageData, $filename, $format, $pageIndex, $index); + } + /** * Extracts a single image from a Position field. * @@ -171,41 +190,43 @@ private function extractFromPage(array $fields, int $pageIndex, string $outputNa * @param integer $pageIndex The page index to extract, begins at 0. * @param integer $index The index to use for naming the extracted image. * @param string $filename The output filename. - * @return ExtractedImage|null The extracted image, or null if the field does not have valid position data. + * @param string $format The output format. + * + * @return null|ExtractedImage The extracted image, or null if the field does not have valid position data. + * * @throws MindeeGeometryException Throws if a field does not contain positional data. */ - public function extractImage(BaseField $field, int $pageIndex, int $index, string $filename): ?ExtractedImage - { - $splitName = $this->splitNameStrict($filename); - $boundingBox = null; + public function extractImage( + BaseField $field, + int $pageIndex, + int $index, + string $filename, + string $format + ): ?ExtractedImage { + $polygon = null; if (!empty($field->polygon)) { - $boundingBox = $field->polygon; + $polygon = $field->polygon; } elseif (!empty($field->boundingBox)) { - $boundingBox = $field->boundingBox; + $polygon = $field->boundingBox; } elseif (!empty($field->quadrangle)) { - $boundingBox = $field->quadrangle; + $polygon = $field->quadrangle; } elseif (!empty($field->rectangle)) { - $boundingBox = $field->rectangle; + $polygon = $field->rectangle; } - if ($boundingBox === null) { + if (null === $polygon) { throw new MindeeGeometryException( - "Provided field has no valid position data.", + 'Provided field has no valid position data.', ErrorCode::GEOMETRIC_OPERATION_FAILED ); } - $bbox = BBoxUtils::generateBBoxFromPolygon($boundingBox); - $fieldFilename = sprintf("%s_%03d.%s", $splitName[0], $index, $this->saveFormat); - $extractedImageData = $this->extractImageFromBbox($bbox, $pageIndex); - - return new ExtractedImage($extractedImageData, $fieldFilename, $this->saveFormat); + return $this->extractPolygonFromPage($polygon, $pageIndex, $index, $filename, $format); } /** * Getter for the local input source. - * * @return LocalInputSource */ public function getInputSource(): LocalInputSource @@ -213,14 +234,43 @@ public function getInputSource(): LocalInputSource return $this->inputSource; } + /** + * Extracts images from a page. + * + * @param array $fields List of Fields to extract. + * @param integer $pageIndex The page index to extract, begins at 0. + * @param string $outputName Name of the created file. + * @param string $format The output format. + * + * @return array an array of created images + */ + protected function extractFromPage(array $fields, int $pageIndex, string $outputName, string $format = 'jpg'): array + { + $format ??= $this->saveFormat; + $extractedImages = []; + + $i = 0; + foreach ($fields as $field) { + $filename = sprintf('%s_page%d-%d.%s', $outputName, $pageIndex, $i, $format); + $extractedImage = $this->extractImage($field, $pageIndex, $i, $filename, $format); + if (null !== $extractedImage) { + $extractedImages[] = $extractedImage; + } + ++$i; + } + + return $extractedImages; + } + /** * Extracts an image from a set of coordinates. * - * @param BBox $bbox BBox coordinates. - * @param integer $pageIndex The page index to extract, begins at 0. + * @param BBox $bbox BBox coordinates. + * @param integer|float $pageIndex The page index to extract, begins at 0. * @return \Imagick + * @throws \ImagickException Throws if the image can't be processed. */ - private function extractImageFromBbox(BBox $bbox, int $pageIndex): \Imagick + protected function extractImageFromBbox(BBox $bbox, int|float $pageIndex): \Imagick { $image = $this->pageImages[$pageIndex]->clone(); $width = $image->getImageWidth(); @@ -231,7 +281,7 @@ private function extractImageFromBbox(BBox $bbox, int $pageIndex): \Imagick $minY = round($bbox->getMinY() * $height); $maxY = round($bbox->getMaxY() * $height); - $image->cropImage($maxX - $minX, $maxY - $minY, $minX, $minY); + $image->cropImage((int)($maxX - $minX), (int)($maxY - $minY), (int)$minX, (int)$minY); return $image; } @@ -240,13 +290,13 @@ private function extractImageFromBbox(BBox $bbox, int $pageIndex): \Imagick * Splits the filename into name and extension. * * @param string $filename Name of the file. - * @return array + * @return array An array containing the name and extension of the file. */ - private function splitNameStrict(string $filename): array + protected static function splitNameStrict(string $filename): array { return [ pathinfo($filename, PATHINFO_FILENAME), - pathinfo($filename, PATHINFO_EXTENSION) + pathinfo($filename, PATHINFO_EXTENSION), ]; } } diff --git a/src/Extraction/PdfExtractor.php b/src/Extraction/PdfExtractor.php index 23605f17..d3e061bf 100644 --- a/src/Extraction/PdfExtractor.php +++ b/src/Extraction/PdfExtractor.php @@ -2,12 +2,10 @@ namespace Mindee\Extraction; -use InvalidArgumentException; use Mindee\Error\MindeePDFException; use Mindee\Error\MindeeUnhandledException; use Mindee\Input\LocalInputSource; use Mindee\Parsing\DependencyChecker; -use Mindee\Product\InvoiceSplitter\InvoiceSplitterV1InvoicePageGroup; use Mindee\Product\InvoiceSplitter\InvoiceSplitterV1InvoicePageGroups; use setasign\Fpdi\Fpdi; use setasign\Fpdi\PdfParser\CrossReference\CrossReferenceException; @@ -22,18 +20,20 @@ class PdfExtractor { /** - * @var string Bytes representation of a file. + * @var string bytes representation of a file */ private string $pdfBytes; + /** - * @var string Name of the file. + * @var string name of the file */ private string $fileName; /** - * @param LocalInputSource $localInput Local Input, accepts all compatible formats. - * @throws MindeeUnhandledException|MindeePDFException Throws if PDF operations aren't supported, - * or if the file can't be read, respectively. + * @param LocalInputSource $localInput local Input, accepts all compatible formats + * + * @throws MindeePDFException|MindeeUnhandledException throws if PDF operations aren't supported, + * or if the file can't be read, respectively */ public function __construct(LocalInputSource $localInput) { @@ -58,30 +58,33 @@ public function __construct(LocalInputSource $localInput) /** * Wrapper for pdf GetPageCount(). * - * @return integer The number of pages in the file. - * @throws MindeePDFException Throws if FPDI is unable to process the file. + * @return int the number of pages in the file + * + * @throws MindeePDFException throws if FPDI is unable to process the file */ public function getPageCount(): int { try { - $pdfHandle = new FPDI(); + $pdfHandle = new Fpdi(); $tempFilename = tempnam(sys_get_temp_dir(), 'extracted_pdf_'); file_put_contents($tempFilename, $this->pdfBytes); + return $pdfHandle->setSourceFile($tempFilename); } catch (PdfParserException $e) { throw new MindeePDFException("Couldn't open PDF file. FPDI sent the following: ", 0, $e); } } - /** * Extracts sub-documents from the source document using list of page indexes. * - * @param array $pageIndexes List of sub-lists of pages to keep. - * @return array List of extracted documents. - * @throws MindeePDFException Throws if FDPF/FPDI wasn't able to handle the pdf during the extraction. - * @throws InvalidArgumentException Throws if invalid indexes are provided. + * @param array $pageIndexes list of sub-lists of pages to keep + * + * @return array list of extracted documents + * + * @throws MindeePDFException throws if FDPF/FPDI wasn't able to handle the pdf during the extraction + * @throws \InvalidArgumentException throws if invalid indexes are provided */ public function extractSubDocuments(array $pageIndexes): array { @@ -89,20 +92,21 @@ public function extractSubDocuments(array $pageIndexes): array foreach ($pageIndexes as $pageIndexElem) { if (empty($pageIndexElem)) { - throw new InvalidArgumentException("Empty indexes not allowed for extraction."); + throw new \InvalidArgumentException('Empty indexes not allowed for extraction.'); } $extension = pathinfo($this->fileName, PATHINFO_EXTENSION); $prefix = pathinfo($this->fileName, PATHINFO_FILENAME); $fieldFilename = sprintf( - "%s_%03d-%03d.%s", + '%s_%03d-%03d.%s', $prefix, $pageIndexElem[0] + 1, $pageIndexElem[count($pageIndexElem) - 1] + 1, $extension ); + try { - $pdf = new FPDI(); + $pdf = new Fpdi(); $tempFilename = tempnam(sys_get_temp_dir(), 'extracted_pdf_'); file_put_contents($tempFilename, $this->pdfBytes); $pdf->setSourceFile($tempFilename); @@ -114,11 +118,7 @@ public function extractSubDocuments(array $pageIndexes): array $mergedPdfBytes = $pdf->Output('S'); } catch ( - PdfParserException | - CrossReferenceException | - FilterException | - PdfTypeException | - PdfReaderException $e + CrossReferenceException|FilterException|PdfParserException|PdfReaderException|PdfTypeException $e ) { throw new MindeePDFException("PDF file couldn't be processed during extraction."); } @@ -131,9 +131,10 @@ public function extractSubDocuments(array $pageIndexes): array /** * Extracts invoices as complete PDFs from the document. * - * @param array| InvoiceSplitterV1InvoicePageGroups $pageIndexes List of sub-lists of pages to keep. - * @param boolean $strict Whether to trust confidence scores or not. - * @return array A list of extracted invoices. + * @param array|InvoiceSplitterV1InvoicePageGroups $pageIndexes List of sub-lists of pages to keep + * @param bool $strict whether to trust confidence scores or not + * + * @return array a list of extracted invoices */ public function extractInvoices($pageIndexes, bool $strict = false): array { @@ -141,11 +142,11 @@ public function extractInvoices($pageIndexes, bool $strict = false): array return []; } if (!$strict) { - $indexes = array_map(function ($invoicePageIndexes) { - return $invoicePageIndexes->pageIndexes; - }, (array)$pageIndexes); + $indexes = array_map(fn ($invoicePageIndexes) => $invoicePageIndexes->pageIndexes, (array) $pageIndexes); + return $this->extractSubDocuments($indexes); - } elseif (is_array($pageIndexes[0])) { + } + if (is_array($pageIndexes[0])) { return $this->extractSubDocuments($pageIndexes); } @@ -158,7 +159,7 @@ public function extractInvoices($pageIndexes, bool $strict = false): array $confidence = $pageIndex->confidence; $pageList = $pageIndex->pageIndexes; - if ($confidence >= 0.5 && $previousConfidence === null) { + if ($confidence >= 0.5 && null === $previousConfidence) { $currentList = $pageList; } elseif ($confidence >= 0.5 && $i !== count($pageIndexes) - 1) { if (!empty($currentList)) { @@ -178,13 +179,14 @@ public function extractInvoices($pageIndexes, bool $strict = false): array } $previousConfidence = $confidence; - $i++; + ++$i; } + return $this->extractSubDocuments($correctPageIndexes); } /** - * @return string Name of the file. + * @return string name of the file */ public function getFileName(): string { diff --git a/src/V1/Image/ImageExtractor.php b/src/V1/Image/ImageExtractor.php new file mode 100644 index 00000000..72656282 --- /dev/null +++ b/src/V1/Image/ImageExtractor.php @@ -0,0 +1,12 @@ +localInput = $localInput; + } + + /** + * Extracts a crop zone from a file. + * + * @param CropItem $crop crop to extract + * + * @return ExtractedImage extracted image + */ + public function extractCrop(CropItem $crop): ExtractedImage + { + return $this->extractCrops([$crop])[0]; + } + + /** + * Extracts multiple crop zones from a file. + * + * @param CropItem[] $crops list of crops to extract + */ + public function extractCrops(array $crops): CropFiles + { + $imageExtractor = new ImageExtractor($this->localInput); + $extractedImages = []; + + $cropsPerPage = []; + foreach ($crops as $crop) { + $cropsPerPage[$crop->location->page][] = $crop; + } + + foreach ($cropsPerPage as $page => $pageCrops) { + $polygons = array_map(fn ($c) => $c->location->polygon, $pageCrops); + + $images = $imageExtractor->extractPolygonsFromPage( + $polygons, + $page + ); + array_push($extractedImages, ...$images); + } + + return new CropFiles(...$extractedImages); + } +} diff --git a/src/V2/FileOperations/CropFiles.php b/src/V2/FileOperations/CropFiles.php new file mode 100644 index 00000000..acad094d --- /dev/null +++ b/src/V2/FileOperations/CropFiles.php @@ -0,0 +1,58 @@ + + */ +class CropFiles extends \ArrayObject +{ + /** + * Builds a new CropFiles collection. + * + * @param ExtractedImage ...$items Items. + */ + public function __construct(ExtractedImage ...$items) + { + parent::__construct($items); + } + + /** + * Save all extracted crops to disk. + * + * @param string $path the directory path to save the extracted crops to + * @param string $prefix prefix to add to the filename + * @param null|string $fileFormat file format to save the crops as + * @param int $quality quality of the saved image + * + * @throws MindeeException if directory creation fails + */ + public function saveAllToDisk( + string $path, + string $prefix = 'crop', + ?string $fileFormat = null, + int $quality = 100 + ): void { + $format ??= $fileFormat; + $idx = 1; + + foreach ($this as $crop) { + $formattedIdx = sprintf('%03d', $idx); + $filename = sprintf('%s_%s.jpg', $prefix, $formattedIdx); + $crop->filename = $filename; + + try { + $crop->writeToFile($path, $format, $quality); + } catch (\ImagickException $e) { + throw new MindeeException('Failed to save crop to disk.', 0, $e); + } + + ++$idx; + } + } +} diff --git a/src/V2/FileOperations/Split.php b/src/V2/FileOperations/Split.php new file mode 100644 index 00000000..8ad0bd8c --- /dev/null +++ b/src/V2/FileOperations/Split.php @@ -0,0 +1,73 @@ +localInput = $inputSource; + } + + /** + * Expands a range to a list of integers. + * + * @param int $start start of the range + * @param int $end end of the range + * + * @return int[] + * + * @throws MindeeInputException if the start page is greater than the end page + */ + public static function expandRange(int $start, int $end): array + { + if ($start > $end || $start < 0) { + throw new MindeeInputException('Invalid page range provided.'); + } + + return range($start, $end); + } + + /** + * Extracts a single split from the input file. + * + * @param int[] $split split range to extract + * + * @return ExtractedPdf 2D array of extracted pages + */ + public function extractSingleSplit(array $split): ExtractedPdf + { + return $this->extractSplits([$split])[0]; + } + + /** + * Extracts the splits from the input file. + * + * @param int[][] $splits list of split ranges to extract + * + * @return SplitFiles list of extracted files + */ + public function extractSplits(array $splits): SplitFiles + { + $pdfExtractor = new PdfExtractor($this->localInput); + $expandedPageIndexes = array_map(fn (array $split) => self::expandRange($split[0], $split[1]), $splits); + + return new SplitFiles(...$pdfExtractor->extractSubDocuments($expandedPageIndexes)); + } +} diff --git a/src/V2/FileOperations/SplitFiles.php b/src/V2/FileOperations/SplitFiles.php new file mode 100644 index 00000000..f6ca8e4f --- /dev/null +++ b/src/V2/FileOperations/SplitFiles.php @@ -0,0 +1,57 @@ + + */ +class SplitFiles extends \ArrayObject +{ + /** + * Builds a new SplitFiles collection. + * + * @param ExtractedPdf ...$items Items. + */ + public function __construct(ExtractedPdf ...$items) + { + parent::__construct($items); + } + + /** + * Save all extracted splits to disk. + * + * @param string $path the directory path to save the extracted splits to + * @param string $prefix prefix to add to the filename + * + * @throws MindeeException if directory creation fails + */ + public function saveAllToDisk(string $path, string $prefix = 'split'): void + { + if (!is_dir($path)) { + if (!mkdir($path, 0o777, true) && !is_dir($path)) { + throw new MindeeException(sprintf('Directory "%s" was not created', $path)); + } + } + + $idx = 1; + + foreach ($this as $split) { + $formattedIdx = sprintf('%03d', $idx); + $filename = sprintf('%s_%s.pdf', $prefix, $formattedIdx); + $filePath = rtrim($path, DIRECTORY_SEPARATOR) . DIRECTORY_SEPARATOR . $filename; + + try { + $split->writeToFile($filePath); + } catch (\Exception $e) { + throw new MindeeException('Failed to save split to disk.', 0, $e->getMessage()); + } + + ++$idx; + } + } +} diff --git a/tests/Dependencies/DependencyCheckerNoExtendedTestPdf.php b/tests/Dependencies/DependencyCheckerNoExtendedTestPdf.php index a82ba53c..766d8016 100644 --- a/tests/Dependencies/DependencyCheckerNoExtendedTestPdf.php +++ b/tests/Dependencies/DependencyCheckerNoExtendedTestPdf.php @@ -5,7 +5,7 @@ use Mindee\Error\MindeeUnhandledException; use Mindee\Extraction\ExtractedImage; use Mindee\Extraction\ExtractedPdf; -use Mindee\Extraction\ImageExtractor; +use Mindee\V1\Image\ImageExtractor; use Mindee\Extraction\PdfExtractor; use Mindee\Input\PathInput; use PHPUnit\Framework\TestCase; @@ -31,8 +31,8 @@ public function testNoExtractedImage() $this->expectException(MindeeUnhandledException::class); $inputImage = ""; $filename = "dummy"; - $saveFormat = "pdf;"; - new ExtractedImage($inputImage, $filename, $saveFormat); + $saveFormat = "pdf"; + new ExtractedImage($inputImage, $filename, $saveFormat, 0, 0); } public function testNoExtractedPdf() { diff --git a/tests/V1/Extraction/ImageExtractorTest.php b/tests/V1/Extraction/ImageExtractorTest.php index a0aa503f..15c25af3 100644 --- a/tests/V1/Extraction/ImageExtractorTest.php +++ b/tests/V1/Extraction/ImageExtractorTest.php @@ -3,7 +3,7 @@ namespace V1\Extraction; use Mindee\Client; -use Mindee\Extraction\ImageExtractor; +use Mindee\V1\Image\ImageExtractor; use Mindee\Input\LocalResponse; use Mindee\Input\PathInput; use Mindee\Product\BarcodeReader\BarcodeReaderV1; @@ -39,7 +39,7 @@ public function testGivenAnImageShouldExtractPositionFields() $source = $extractedImage->asInputSource(); $this->assertEquals( - sprintf("default_sample_page-001_%03d.jpg", $i + 1), + sprintf("default_sample.jpg_page0-%d.jpg", $i), $source->fileName ); } @@ -68,7 +68,7 @@ public function testGivenAnImageShouldExtractValueFields() $this->assertNotNull($extractedImage->image); $source = $extractedImage->asInputSource(); $this->assertEquals( - sprintf("barcodes_1D_page-001_%03d.jpg", $i + 1), + sprintf("barcodes_1D.jpg_page0-%d.jpg", $i), $source->fileName ); $extractedImage->writeToFile(\TestingUtilities::getRootDataDir() . "/output"); @@ -103,7 +103,7 @@ public function testGivenAPdfShouldExtractPositionFields() $source = $extractedImage->asInputSource(); $this->assertEquals( - sprintf("multipage_sample_page-%03d_%03d.jpg", $page->id + 1, $i + 1), + sprintf("multipage_sample.pdf_page%d-%d.jpg", $page->id, $i), $source->fileName ); } diff --git a/tests/V2/FileOperations/CropFunctional.php b/tests/V2/FileOperations/CropFunctional.php new file mode 100644 index 00000000..6c8c9051 --- /dev/null +++ b/tests/V2/FileOperations/CropFunctional.php @@ -0,0 +1,104 @@ +client = new ClientV2($apiKey); + $this->cropModelId = getenv('MINDEE_V2_CROP_MODEL_ID') ?: ''; + $this->findocModelId = getenv('MINDEE_V2_FINDOC_MODEL_ID') ?: ''; + + $this->outputDir = getcwd() . '/output'; + if (!is_dir($this->outputDir)) { + mkdir($this->outputDir, 0777, true); + } + } + + protected function tearDown(): void + { + $file1 = $this->outputDir . '/crop_001.jpg'; + $file2 = $this->outputDir . '/crop_002.jpg'; + + if (file_exists($file1)) { + unlink($file1); + } + if (file_exists($file2)) { + unlink($file2); + } + } + + private function checkFindocReturn(InferenceResponse $findocResponse): void + { + $this->assertGreaterThan(0, strlen($findocResponse->inference->model->id)); + + $totalAmount = $findocResponse->inference->result->fields['total_amount']; + $this->assertNotNull($totalAmount); + $this->assertGreaterThan(0, $totalAmount->value); + } + + public function testExtractCropsFromImageCorrectly(): void + { + $inputSource = new PathInput(\TestingUtilities::getV2ProductDir() . '/crop/default_sample.jpg'); + $cropParams = new CropParameters($this->cropModelId); + + $response = $this->client->enqueueAndGetResult(CropResponse::class, $inputSource, $cropParams); + + $this->assertNotNull($response); + $this->assertCount(2, $response->inference->result->crops); + + $cropOperation = new Crop($inputSource); + $extractedImages = $cropOperation->extractCrops($response->inference->result->crops); + + $this->assertCount(2, $extractedImages); + $this->assertEquals('default_sample.jpg_page0-0.jpg', $extractedImages[0]->filename); + $this->assertEquals('default_sample.jpg_page0-1.jpg', $extractedImages[1]->filename); + + $extractionInput = $extractedImages[0]->asInputSource(); + $findocParams = new InferenceParameters($this->findocModelId); + + $invoice0 = $this->client->enqueueAndGetResult(InferenceResponse::class, $extractionInput, $findocParams); + + $this->checkFindocReturn($invoice0); + + $extractedImages->saveAllToDisk($this->outputDir, quality: 50); + + $file1Info = filesize($this->outputDir . '/crop_001.jpg'); + $this->assertGreaterThanOrEqual(99000, $file1Info); + $this->assertLessThanOrEqual(110000, $file1Info); + + $file2Info = filesize($this->outputDir . '/crop_002.jpg'); + $this->assertGreaterThanOrEqual(99000, $file2Info); + $this->assertLessThanOrEqual(110000, $file2Info); + } + + public function testExtractCropsFromEachPdfPageCorrectly(): void + { + $inputSource = new PathInput(\TestingUtilities::getV2ProductDir() . '/crop/multipage_sample.pdf'); + $cropParams = new CropParameters($this->cropModelId); + + $response = $this->client->enqueueAndGetResult(CropResponse::class, $inputSource, $cropParams); + $cropOperation = new Crop($inputSource); + $extractedImages = $cropOperation->extractCrops($response->inference->result->crops); + + $this->assertCount(5, $extractedImages); + $this->assertEquals('multipage_sample.pdf_page0-0.jpg', $extractedImages[0]->filename); + $this->assertEquals('multipage_sample.pdf_page1-0.jpg', $extractedImages[3]->filename); + } +} diff --git a/tests/V2/FileOperations/CropTest.php b/tests/V2/FileOperations/CropTest.php new file mode 100644 index 00000000..51ddb48c --- /dev/null +++ b/tests/V2/FileOperations/CropTest.php @@ -0,0 +1,68 @@ +cropDataDir = \TestingUtilities::getV2DataDir() . '/products/crop'; + } + + public function testProcessesSinglePageCropSplitCorrectly(): void + { + $inputSample = new PathInput($this->cropDataDir . '/default_sample.jpg'); + + $localResponse = new LocalResponse($this->cropDataDir . '/crop_single.json'); + $doc = $localResponse->deserializeResponse(CropResponse::class); + + $cropOperation = new Crop($inputSample); + $extractedCrops = $cropOperation->extractCrops($doc->inference->result->crops); + + $this->assertCount(1, $extractedCrops); + + $this->assertEquals(0, $extractedCrops[0]->pageId); + $this->assertEquals(0, $extractedCrops[0]->elementId); + + $bitmap0 = $extractedCrops[0]->image; + + $this->assertEquals(2822, $bitmap0->width ?? clone $bitmap0->getWidth()); + $this->assertEquals(1572, $bitmap0->height ?? clone $bitmap0->getHeight()); + } + + public function testProcessesMultiPageReceiptSplitCorrectly(): void + { + $inputSample = new PathInput($this->cropDataDir . '/multipage_sample.pdf'); + + $localResponse = new LocalResponse($this->cropDataDir . '/crop_multiple.json'); + $doc = $localResponse->deserializeResponse(CropResponse::class); + + $cropOperation = new Crop($inputSample); + $extractedCrops = $cropOperation->extractCrops($doc->inference->result->crops); + + $this->assertCount(2, $extractedCrops); + + $this->assertEquals(0, $extractedCrops[0]->pageId); + $this->assertEquals(0, $extractedCrops[0]->elementId); + + $bitmap0 = $extractedCrops[0]->image; + $this->assertEquals(156, $bitmap0->width ?? $bitmap0->getWidth()); + $this->assertEquals(757, $bitmap0->height ?? $bitmap0->getHeight()); + + $this->assertEquals(0, $extractedCrops[1]->pageId); + $this->assertEquals(1, $extractedCrops[1]->elementId); + + $bitmap1 = $extractedCrops[1]->image; + $this->assertEquals(188, $bitmap1->width ?? $bitmap1->getWidth()); + $this->assertEquals(691, $bitmap1->height ?? $bitmap1->getHeight()); + } +} diff --git a/tests/V2/FileOperations/SplitFunctional.php b/tests/V2/FileOperations/SplitFunctional.php new file mode 100644 index 00000000..dd12191d --- /dev/null +++ b/tests/V2/FileOperations/SplitFunctional.php @@ -0,0 +1,95 @@ +client = new ClientV2($apiKey); + $this->splitModelId = getenv('MINDEE_V2_SPLIT_MODEL_ID') ?: ''; + $this->findocModelId = getenv('MINDEE_V2_FINDOC_MODEL_ID') ?: ''; + + $this->outputDir = getcwd() . '/output'; + if (!is_dir($this->outputDir)) { + mkdir($this->outputDir, 0777, true); + } + } + + protected function tearDown(): void + { + $file1 = $this->outputDir . '/split_001.pdf'; + $file2 = $this->outputDir . '/split_002.pdf'; + + if (file_exists($file1)) { + unlink($file1); + } + if (file_exists($file2)) { + unlink($file2); + } + } + + private function checkFindocReturn(InferenceResponse $findocResponse): void + { + $this->assertGreaterThan(0, strlen($findocResponse->inference->model->id)); + + $totalAmount = $findocResponse->inference->result->fields['total_amount']; + $this->assertNotNull($totalAmount); + $this->assertGreaterThan(0, $totalAmount->value); + } + + public function testExtractSplitsFromPdfCorrectly(): void + { + $inputSource = new PathInput(\TestingUtilities::getV2ProductDir() . '/split/default_sample.pdf'); + $splitParams = new SplitParameters($this->splitModelId); + + $response = $this->client->enqueueAndGetResult(SplitResponse::class, $inputSource, $splitParams); + + $this->assertNotNull($response); + $this->assertCount(2, $response->inference->result->splits); + + $splitOperation = new Split($inputSource); + $extractedSplits = $splitOperation->extractSplits( + array_map(fn($s) => $s->pageRange, $response->inference->result->splits) + ); + + $this->assertCount(2, $extractedSplits); + $this->assertEquals('default_sample_001-001.pdf', $extractedSplits[0]->filename); + $this->assertEquals('default_sample_002-002.pdf', $extractedSplits[1]->filename); + + $inferenceInput = $extractedSplits[0]->asInputSource(); + $findocParams = new InferenceParameters($this->findocModelId); + + $invoice0 = $this->client->enqueueAndGetResult(InferenceResponse::class, $inferenceInput, $findocParams); + + $this->checkFindocReturn($invoice0); + + $extractedSplits->saveAllToDisk($this->outputDir); + + for ($i = 0; $i < count($extractedSplits); $i++) { + $fileName = sprintf('split_%03d.pdf', $i + 1); + $filePath = $this->outputDir . '/' . $fileName; + + $this->assertFileExists($filePath); + $this->assertGreaterThan(0, filesize($filePath)); + + $localInput = new PathInput($filePath); + $this->assertEquals($extractedSplits[$i]->getPageCount(), $localInput->getPageCount()); + } + } +} \ No newline at end of file diff --git a/tests/V2/FileOperations/SplitTest.php b/tests/V2/FileOperations/SplitTest.php new file mode 100644 index 00000000..97f8e833 --- /dev/null +++ b/tests/V2/FileOperations/SplitTest.php @@ -0,0 +1,55 @@ +splitDataDir = \TestingUtilities::getV2DataDir() . '/products/split'; + $this->finDocDataDir = \TestingUtilities::getV2DataDir() . '/products/extraction/financial_document'; + } + + public function testProcessesSinglePageSplitCorrectly(): void + { + $inputSample = new PathInput($this->finDocDataDir . '/default_sample.jpg'); + + $localResponse = new LocalResponse($this->splitDataDir . '/split_single.json'); + $doc = $localResponse->deserializeResponse(SplitResponse::class); + + $splitOperation = new Split($inputSample); + $splits = $doc->inference->result->splits; + $extractedSplits = $splitOperation->extractSplits(array_map(fn($s) => $s->pageRange, $splits)); + + $this->assertCount(1, $extractedSplits); + + $this->assertEquals(1, $extractedSplits[0]->getPageCount()); + } + + public function testProcessesMultiPageReceiptSplitCorrectly(): void + { + $inputSample = new PathInput($this->splitDataDir . '/invoice_5p.pdf'); + + $localResponse = new LocalResponse($this->splitDataDir . '/split_multiple.json'); + $doc = $localResponse->deserializeResponse(SplitResponse::class); + + $splitOperation = new Split($inputSample); + $splits = $doc->inference->result->splits; + $extractedSplits = $splitOperation->extractSplits(array_map(fn($s) => $s->pageRange, $splits)); + + $this->assertCount(3, $extractedSplits); + + $this->assertEquals(1, $extractedSplits[0]->getPageCount()); + $this->assertEquals(3, $extractedSplits[1]->getPageCount()); + $this->assertEquals(1, $extractedSplits[2]->getPageCount()); + } +} From ca35a12f4278e17f91f9473f1c9cbd2342509d73 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Tue, 21 Apr 2026 14:10:11 +0200 Subject: [PATCH 2/4] fix lint --- src/Extraction/ExtractedPdf.php | 17 ++++++------- src/Extraction/PdfExtractor.php | 37 +++++++++++++++-------------- src/V2/FileOperations/CropFiles.php | 11 +++++---- src/V2/FileOperations/Split.php | 12 +++++----- 4 files changed, 40 insertions(+), 37 deletions(-) diff --git a/src/Extraction/ExtractedPdf.php b/src/Extraction/ExtractedPdf.php index bffca188..860d642c 100644 --- a/src/Extraction/ExtractedPdf.php +++ b/src/Extraction/ExtractedPdf.php @@ -21,17 +21,17 @@ class ExtractedPdf public string $filename; /** - * File object for an ExtractedPdf. + * @var string File object for an ExtractedPdf. */ protected string $pdfBytes; /** * Initializes a new instance of the ExtractedPdf class. * - * @param string $pdfBytes a binary string representation of the PDF - * @param string $filename name of the original file + * @param string $pdfBytes A binary string representation of the PDF. + * @param string $filename Name of the original file. * - * @throws MindeeUnhandledException throws if PDF operations aren't supported + * @throws MindeeUnhandledException Throws if PDF operations aren't supported. */ public function __construct(string $pdfBytes, string $filename) { @@ -44,9 +44,9 @@ public function __construct(string $pdfBytes, string $filename) /** * Wrapper for pdf GetPageCount(). * - * @return int the number of pages in the file + * @return integer the number of pages in the file * - * @throws MindeePDFException throws if FPDI is unable to process the file + * @throws MindeePDFException Throws if FPDI is unable to process the file. */ public function getPageCount(): int { @@ -69,11 +69,12 @@ public function getPageCount(): int /** * Write the PDF to a file. * - * @param string $outputPath the output directory (must exist) + * @param string $outputPath The output directory (must exist). + * @return void */ public function writeToFile(string $outputPath): void { - $pdfPath = $outputPath.DIRECTORY_SEPARATOR.$this->filename; + $pdfPath = $outputPath . DIRECTORY_SEPARATOR . $this->filename; if ('' !== basename($outputPath)) { if (!($pdfPath = realpath($outputPath))) { $pdfPath = $outputPath; diff --git a/src/Extraction/PdfExtractor.php b/src/Extraction/PdfExtractor.php index d3e061bf..36bac699 100644 --- a/src/Extraction/PdfExtractor.php +++ b/src/Extraction/PdfExtractor.php @@ -3,7 +3,6 @@ namespace Mindee\Extraction; use Mindee\Error\MindeePDFException; -use Mindee\Error\MindeeUnhandledException; use Mindee\Input\LocalInputSource; use Mindee\Parsing\DependencyChecker; use Mindee\Product\InvoiceSplitter\InvoiceSplitterV1InvoicePageGroups; @@ -11,7 +10,6 @@ use setasign\Fpdi\PdfParser\CrossReference\CrossReferenceException; use setasign\Fpdi\PdfParser\Filter\FilterException; use setasign\Fpdi\PdfParser\PdfParserException; -use setasign\Fpdi\PdfParser\Type\PdfTypeException; use setasign\Fpdi\PdfReader\PdfReaderException; /** @@ -30,10 +28,10 @@ class PdfExtractor private string $fileName; /** - * @param LocalInputSource $localInput local Input, accepts all compatible formats + * @param LocalInputSource $localInput Local Input, accepts all compatible formats. * - * @throws MindeePDFException|MindeeUnhandledException throws if PDF operations aren't supported, - * or if the file can't be read, respectively + * @throws MindeePDFException Throws if PDF operations aren't supported, or if the file + * can't be read, respectively. */ public function __construct(LocalInputSource $localInput) { @@ -58,9 +56,9 @@ public function __construct(LocalInputSource $localInput) /** * Wrapper for pdf GetPageCount(). * - * @return int the number of pages in the file + * @return integer The number of pages in the file. * - * @throws MindeePDFException throws if FPDI is unable to process the file + * @throws MindeePDFException Throws if FPDI is unable to process the file. */ public function getPageCount(): int { @@ -79,14 +77,14 @@ public function getPageCount(): int /** * Extracts sub-documents from the source document using list of page indexes. * - * @param array $pageIndexes list of sub-lists of pages to keep + * @param array|InvoiceSplitterV1InvoicePageGroups $pageIndexes List of sub-lists of pages to keep. * - * @return array list of extracted documents + * @return ExtractedPdf[] list of extracted documents * - * @throws MindeePDFException throws if FDPF/FPDI wasn't able to handle the pdf during the extraction - * @throws \InvalidArgumentException throws if invalid indexes are provided + * @throws MindeePDFException Throws if FDPF/FPDI wasn't able to handle the pdf during the extraction. + * @throws \InvalidArgumentException Throws if invalid indexes are provided. */ - public function extractSubDocuments(array $pageIndexes): array + public function extractSubDocuments(mixed $pageIndexes): array { $extractedPdfs = []; @@ -118,9 +116,12 @@ public function extractSubDocuments(array $pageIndexes): array $mergedPdfBytes = $pdf->Output('S'); } catch ( - CrossReferenceException|FilterException|PdfParserException|PdfReaderException|PdfTypeException $e + CrossReferenceException | + FilterException | + PdfParserException | + PdfReaderException $e ) { - throw new MindeePDFException("PDF file couldn't be processed during extraction."); + throw new MindeePDFException("PDF file couldn't be processed during extraction.", 0, $e); } $extractedPdfs[] = new ExtractedPdf($mergedPdfBytes, $fieldFilename); } @@ -131,12 +132,12 @@ public function extractSubDocuments(array $pageIndexes): array /** * Extracts invoices as complete PDFs from the document. * - * @param array|InvoiceSplitterV1InvoicePageGroups $pageIndexes List of sub-lists of pages to keep - * @param bool $strict whether to trust confidence scores or not + * @param array|InvoiceSplitterV1InvoicePageGroups $pageIndexes List of sub-lists of pages to keep. + * @param boolean $strict Whether to trust confidence scores or not. * - * @return array a list of extracted invoices + * @return ExtractedPdf[] a list of extracted invoices */ - public function extractInvoices($pageIndexes, bool $strict = false): array + public function extractInvoices(mixed $pageIndexes, bool $strict = false): array { if (empty($pageIndexes)) { return []; diff --git a/src/V2/FileOperations/CropFiles.php b/src/V2/FileOperations/CropFiles.php index acad094d..14c5999f 100644 --- a/src/V2/FileOperations/CropFiles.php +++ b/src/V2/FileOperations/CropFiles.php @@ -25,12 +25,13 @@ public function __construct(ExtractedImage ...$items) /** * Save all extracted crops to disk. * - * @param string $path the directory path to save the extracted crops to - * @param string $prefix prefix to add to the filename - * @param null|string $fileFormat file format to save the crops as - * @param int $quality quality of the saved image + * @param string $path The directory path to save the extracted crops to. + * @param string $prefix Prefix to add to the filename. + * @param null|string $fileFormat File format to save the crops as. + * @param integer $quality Quality of the saved image. * - * @throws MindeeException if directory creation fails + * @throws MindeeException If directory creation fails. + * @return void */ public function saveAllToDisk( string $path, diff --git a/src/V2/FileOperations/Split.php b/src/V2/FileOperations/Split.php index 8ad0bd8c..43a076e2 100644 --- a/src/V2/FileOperations/Split.php +++ b/src/V2/FileOperations/Split.php @@ -18,7 +18,7 @@ class Split private readonly LocalInputSource $localInput; /** - * @param LocalInputSource $inputSource localInputSource object + * @param LocalInputSource $inputSource LocalInputSource object. */ public function __construct(LocalInputSource $inputSource) { @@ -28,12 +28,12 @@ public function __construct(LocalInputSource $inputSource) /** * Expands a range to a list of integers. * - * @param int $start start of the range - * @param int $end end of the range + * @param integer $start Start of the range. + * @param integer $end End of the range. * * @return int[] * - * @throws MindeeInputException if the start page is greater than the end page + * @throws MindeeInputException If the start page is greater than the end page. */ public static function expandRange(int $start, int $end): array { @@ -47,7 +47,7 @@ public static function expandRange(int $start, int $end): array /** * Extracts a single split from the input file. * - * @param int[] $split split range to extract + * @param int[] $split Split range to extract. * * @return ExtractedPdf 2D array of extracted pages */ @@ -59,7 +59,7 @@ public function extractSingleSplit(array $split): ExtractedPdf /** * Extracts the splits from the input file. * - * @param int[][] $splits list of split ranges to extract + * @param int[][] $splits List of split ranges to extract. * * @return SplitFiles list of extracted files */ From 21d8a6d5cbbdda9417c45a25458fceda4836b533 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Tue, 21 Apr 2026 15:37:18 +0200 Subject: [PATCH 3/4] fix lint --- src/V2/FileOperations/Crop.php | 7 ++++--- src/V2/FileOperations/SplitFiles.php | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/V2/FileOperations/Crop.php b/src/V2/FileOperations/Crop.php index 5c637c0e..9c9ee971 100644 --- a/src/V2/FileOperations/Crop.php +++ b/src/V2/FileOperations/Crop.php @@ -18,7 +18,7 @@ class Crop private readonly LocalInputSource $localInput; /** - * @param LocalInputSource $localInput localInputSource object + * @param LocalInputSource $localInput LocalInputSource object. */ public function __construct(LocalInputSource $localInput) { @@ -28,7 +28,7 @@ public function __construct(LocalInputSource $localInput) /** * Extracts a crop zone from a file. * - * @param CropItem $crop crop to extract + * @param CropItem $crop Crop to extract. * * @return ExtractedImage extracted image */ @@ -40,7 +40,8 @@ public function extractCrop(CropItem $crop): ExtractedImage /** * Extracts multiple crop zones from a file. * - * @param CropItem[] $crops list of crops to extract + * @param CropItem[] $crops List of crops to extract. + * @return CropFiles list of extracted files */ public function extractCrops(array $crops): CropFiles { diff --git a/src/V2/FileOperations/SplitFiles.php b/src/V2/FileOperations/SplitFiles.php index f6ca8e4f..b9fefd66 100644 --- a/src/V2/FileOperations/SplitFiles.php +++ b/src/V2/FileOperations/SplitFiles.php @@ -25,10 +25,11 @@ public function __construct(ExtractedPdf ...$items) /** * Save all extracted splits to disk. * - * @param string $path the directory path to save the extracted splits to - * @param string $prefix prefix to add to the filename + * @param string $path The directory path to save the extracted splits to. + * @param string $prefix Prefix to add to the filename. * - * @throws MindeeException if directory creation fails + * @throws MindeeException If directory creation fails. + * @return void */ public function saveAllToDisk(string $path, string $prefix = 'split'): void { From 15bcb315b1a0ee75478216be104d4f48a9266ed9 Mon Sep 17 00:00:00 2001 From: sebastianMindee <130448732+sebastianMindee@users.noreply.github.com> Date: Tue, 21 Apr 2026 17:13:58 +0200 Subject: [PATCH 4/4] fix naming --- src/Extraction/ImageExtractor.php | 41 ++++++++++++++++------ src/V2/FileOperations/Crop.php | 4 ++- tests/V2/FileOperations/CropFunctional.php | 4 +-- 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/src/Extraction/ImageExtractor.php b/src/Extraction/ImageExtractor.php index a1a39d23..16bc580a 100644 --- a/src/Extraction/ImageExtractor.php +++ b/src/Extraction/ImageExtractor.php @@ -137,19 +137,37 @@ public function extractImagesFromPage(array $fields, int $pageIndex, ?string $ou /** * Extracts images from a page. * - * @param array $polygons List of polygons to extract. - * @param integer $pageIndex The page index to extract, begins at 0. - * @param null|string $format Save format for extracted images. Defaults to the original format. + * @param array $polygons List of polygons to extract. + * @param integer $pageIndex The page index to extract, begins at 0. + * @param null|string $filenamePrefix Output filename prefix. + * @param null|string $format Save format for extracted images. Defaults to the original format. * * @return array an array of created images + * @throws MindeeImageException Throws if the image can't be processed. */ - public function extractPolygonsFromPage(array $polygons, int $pageIndex, ?string $format = null): array - { + public function extractPolygonsFromPage( + array $polygons, + int $pageIndex, + ?string $filenamePrefix = null, + ?string $format = null + ): array { $saveFormat = $format ?? $this->saveFormat; $extractedImages = []; - foreach ($polygons as $i => $polygon) { - $extractedImages[] = $this->extractPolygonFromPage($polygon, $pageIndex, $i, null, $format); + try { + foreach ($polygons as $i => $polygon) { + $filenamePrefix ??= $this->filename; + $outputFilename = sprintf('%s-%d.%s', $filenamePrefix, $i, $saveFormat); + $extractedImages[] = $this->extractPolygonFromPage( + $polygon, + $pageIndex, + $i, + $outputFilename, + $saveFormat + ); + } + } catch (\ImagickException $e) { + throw new MindeeImageException($e->getMessage(), $e->getCode(), $e); } return $extractedImages; @@ -165,7 +183,7 @@ public function extractPolygonsFromPage(array $polygons, int $pageIndex, ?string * @param null|string $format Output format. * * @return ExtractedImage Extracted image data. - * @throws \ImagickException Throws if the image can't be processed. + * @throws MindeeImageException Throws if the image can't be processed. */ public function extractPolygonFromPage( Polygon $polygon, @@ -175,11 +193,14 @@ public function extractPolygonFromPage( ?string $format = null ): ExtractedImage { $bbox = BBoxUtils::generateBBoxFromPolygon($polygon); - $extractedImageData = $this->extractImageFromBbox($bbox, $pageIndex); + try { + $extractedImageData = $this->extractImageFromBbox($bbox, $pageIndex); + } catch (\ImagickException $e) { + throw new MindeeImageException($e->getMessage(), $e->getCode(), $e); + } $filename ??= $this->filename; $format ??= $this->saveFormat; $filename ??= sprintf('%s.%s_page%d-%d.%s', $filename, $format, $pageIndex, $index, $format); - return new ExtractedImage($extractedImageData, $filename, $format, $pageIndex, $index); } diff --git a/src/V2/FileOperations/Crop.php b/src/V2/FileOperations/Crop.php index 9c9ee971..56dc6d4f 100644 --- a/src/V2/FileOperations/Crop.php +++ b/src/V2/FileOperations/Crop.php @@ -55,10 +55,12 @@ public function extractCrops(array $crops): CropFiles foreach ($cropsPerPage as $page => $pageCrops) { $polygons = array_map(fn ($c) => $c->location->polygon, $pageCrops); + $filenamePrefix = sprintf('%s_page%d', $this->localInput->fileName, $page); $images = $imageExtractor->extractPolygonsFromPage( $polygons, - $page + $page, + $filenamePrefix ); array_push($extractedImages, ...$images); } diff --git a/tests/V2/FileOperations/CropFunctional.php b/tests/V2/FileOperations/CropFunctional.php index 6c8c9051..93878fb3 100644 --- a/tests/V2/FileOperations/CropFunctional.php +++ b/tests/V2/FileOperations/CropFunctional.php @@ -80,11 +80,11 @@ public function testExtractCropsFromImageCorrectly(): void $extractedImages->saveAllToDisk($this->outputDir, quality: 50); $file1Info = filesize($this->outputDir . '/crop_001.jpg'); - $this->assertGreaterThanOrEqual(99000, $file1Info); + $this->assertGreaterThanOrEqual(98000, $file1Info); $this->assertLessThanOrEqual(110000, $file1Info); $file2Info = filesize($this->outputDir . '/crop_002.jpg'); - $this->assertGreaterThanOrEqual(99000, $file2Info); + $this->assertGreaterThanOrEqual(98000, $file2Info); $this->assertLessThanOrEqual(110000, $file2Info); }