webmoney/backend/app/Services/Import/PdfParser.php

195 lines
5.5 KiB
PHP

<?php
namespace App\Services\Import;
use Smalot\PdfParser\Parser as PdfParserLib;
/**
 * Parses PDF files into rows of columns for the import pipeline.
 *
 * Text is extracted with smalot/pdfparser when that library is installed, and
 * with the `pdftotext` command-line tool otherwise (or when the library throws).
 * Every non-empty line of text is then split into columns on runs of two or
 * more whitespace characters, falling back to single tabs.
 */
class PdfParser implements FileParserInterface
{
    /** @var string[] Lower-case file extensions (without dot) handled by this parser. */
    protected static array $supportedExtensions = ['pdf'];

    /**
     * Parse a PDF file into a header row and data rows.
     *
     * Row indexes refer to positions in the list of non-empty text lines
     * extracted from the PDF, starting at 0.
     *
     * @param string $filePath Path to the PDF file.
     * @param array  $options  Supported keys:
     *                         - `header_row`: index of the header line (default 0);
     *                         - `data_start_row`: index of the first data line
     *                           (default: `header_row + 1`).
     *
     * @return array{headers: string[], data: array<int, string[]>, total_rows: int}
     *
     * @throws \RuntimeException When no text can be extracted from the file.
     */
    public function parse(string $filePath, array $options = []): array
    {
        $headerRow = (int) ($options['header_row'] ?? 0);

        // Data normally starts right after the header. A fixed default of 1 would
        // pull the lines *above* a non-zero header row into the data set.
        $dataStartRow = (int) ($options['data_start_row'] ?? ($headerRow + 1));

        return $this->parseTableStructure($this->extractLines($filePath), $headerRow, $dataStartRow);
    }

    /**
     * Return the column headers found on the configured header line.
     *
     * @param string $filePath Path to the PDF file.
     * @param array  $options  Supported key: `header_row` (default 0).
     *
     * @return string[] Parsed header cells, or an empty array when the header line does not exist.
     *
     * @throws \RuntimeException When no text can be extracted from the file.
     */
    public function getHeaders(string $filePath, array $options = []): array
    {
        $headerRow = (int) ($options['header_row'] ?? 0);
        $lines = $this->extractLines($filePath);

        return isset($lines[$headerRow]) ? $this->parseLine($lines[$headerRow]) : [];
    }

    /**
     * Build a preview made of the first parsed lines of the PDF.
     *
     * @param string $filePath Path to the PDF file.
     * @param int    $rows     Maximum number of lines to include in the preview.
     * @param array  $options  Accepted for interface compatibility; not read by this parser.
     *
     * @return array{preview: array<int, array{row_index: int, data: string[], raw: string}>, total_rows: int, columns_count: int, raw_text_available: bool}
     *               `total_rows` counts every extracted non-empty line (not just the previewed ones);
     *               `columns_count` is the column count of the first previewed line.
     *
     * @throws \RuntimeException When no text can be extracted from the file.
     */
    public function getPreview(string $filePath, int $rows = 10, array $options = []): array
    {
        $lines = $this->extractLines($filePath);
        $preview = [];

        foreach ($lines as $index => $line) {
            if (count($preview) >= $rows) {
                break;
            }

            $cells = $this->parseLine($line);
            if (empty($cells)) {
                continue;
            }

            $preview[] = [
                'row_index' => $index,
                'data' => $cells,
                'raw' => $line,
            ];
        }

        return [
            'preview' => $preview,
            'total_rows' => count($lines),
            'columns_count' => !empty($preview) ? count($preview[0]['data']) : 0,
            'raw_text_available' => true,
        ];
    }

    /**
     * Extract the non-empty text lines of a PDF.
     *
     * Uses smalot/pdfparser when the class is available and falls back to the
     * `pdftotext` command when the library is missing or throws.
     *
     * @param string $filePath Path to the PDF file.
     *
     * @return string[] Non-empty lines, re-indexed from 0.
     *
     * @throws \RuntimeException When both extraction strategies fail. If the library
     *                           failed first, its exception is attached as `previous`.
     */
    protected function extractLines(string $filePath): array
    {
        // Library not installed: go straight to pdftotext (poppler-utils).
        if (!class_exists(PdfParserLib::class)) {
            return $this->extractWithPdftotext($filePath);
        }

        try {
            $text = (new PdfParserLib())->parseFile($filePath)->getText();
        } catch (\Exception $e) {
            try {
                return $this->extractWithPdftotext($filePath);
            } catch (\RuntimeException $fallbackError) {
                // Keep the library's error as the root cause instead of discarding it.
                throw new \RuntimeException(
                    $fallbackError->getMessage(),
                    (int) $fallbackError->getCode(),
                    $e
                );
            }
        }

        // Split on any newline convention so no stray "\r" is left in the lines.
        $lines = preg_split('/\r\n|\r|\n/', (string) $text);

        return $this->cleanLines($lines === false ? [] : $lines);
    }

    /**
     * Extract the non-empty text lines of a PDF with the `pdftotext` command.
     *
     * Tries layout-preserving mode first, then plain mode when the first run
     * fails or yields no output.
     *
     * @param string $filePath Path to the PDF file.
     *
     * @return string[] Non-empty lines, re-indexed from 0.
     *
     * @throws \RuntimeException When the last pdftotext run exits with a non-zero status.
     */
    protected function extractWithPdftotext(string $filePath): array
    {
        $target = escapeshellarg($filePath);
        $output = [];
        $returnVar = 0;

        // First attempt: preserve the physical layout so columns stay aligned.
        exec("pdftotext -layout " . $target . " - 2>/dev/null", $output, $returnVar);

        if ($returnVar !== 0 || empty($output)) {
            // exec() appends to a non-empty array, so discard any partial output
            // from the failed attempt before retrying without -layout.
            $output = [];
            exec("pdftotext " . $target . " - 2>/dev/null", $output, $returnVar);
        }

        if ($returnVar !== 0) {
            throw new \RuntimeException("Could not extract text from PDF. Please install poppler-utils or smalot/pdfparser.");
        }

        return $this->cleanLines($output);
    }

    /**
     * Remove page-break characters and drop blank lines.
     *
     * pdftotext separates pages with a form feed ("\f"), which trim() does not
     * strip by default; left in place it would create bogus rows and pollute the
     * first cell of each page.
     *
     * @param array $lines Raw text lines.
     *
     * @return string[] Cleaned, non-empty lines re-indexed from 0.
     */
    private function cleanLines(array $lines): array
    {
        $cleaned = [];

        foreach ($lines as $line) {
            $line = str_replace("\f", '', (string) $line);

            if (trim($line) !== '') {
                $cleaned[] = $line;
            }
        }

        return $cleaned;
    }

    /**
     * Split a single text line into trimmed column values.
     *
     * Strategy, in order: split the trimmed line on runs of two or more
     * whitespace characters; otherwise split the untrimmed line on tabs (so a
     * leading or trailing tab yields an empty column); otherwise return the
     * trimmed line as a single column.
     *
     * @param string $line Raw text line.
     *
     * @return string[] At least one element.
     */
    protected function parseLine(string $line): array
    {
        $trimmed = trim($line);

        // preg_split() returns false on a PCRE error; treat that as "no split".
        $bySpaces = preg_split('/\s{2,}/', $trimmed);
        if (is_array($bySpaces) && count($bySpaces) > 1) {
            return array_map('trim', $bySpaces);
        }

        $byTabs = explode("\t", $line);
        if (count($byTabs) > 1) {
            return array_map('trim', $byTabs);
        }

        return [$trimmed];
    }

    /**
     * Turn extracted lines into a header row plus normalised data rows.
     *
     * When a header row is found, every data row is padded with empty strings
     * or truncated so it has exactly as many columns as the header.
     *
     * @param array $lines        Text lines keyed by row index.
     * @param int   $headerRow    Index of the header line.
     * @param int   $dataStartRow Index of the first data line.
     *
     * @return array{headers: string[], data: array<int, string[]>, total_rows: int}
     */
    protected function parseTableStructure(array $lines, int $headerRow, int $dataStartRow): array
    {
        // Resolve the headers up front so that data rows located before the header
        // line (header_row >= data_start_row) are normalised like all the others.
        $headers = isset($lines[$headerRow]) ? $this->parseLine($lines[$headerRow]) : [];
        $columnCount = count($headers);

        $data = [];

        foreach ($lines as $index => $line) {
            if ($index === $headerRow || $index < $dataStartRow) {
                continue;
            }

            $row = $this->parseLine($line);
            if (empty($row)) {
                continue;
            }

            if ($columnCount > 0) {
                // Pad short rows, then cut long ones, to match the header width.
                $row = array_slice(array_pad($row, $columnCount, ''), 0, $columnCount);
            }

            $data[] = $row;
        }

        return [
            'headers' => $headers,
            'data' => $data,
            'total_rows' => count($data),
        ];
    }

    /**
     * Tell whether this parser handles the given file extension.
     *
     * @param string $extension File extension without the leading dot; case-insensitive.
     *
     * @return bool True when the extension is listed in $supportedExtensions.
     */
    public static function supports(string $extension): bool
    {
        return in_array(strtolower($extension), self::$supportedExtensions, true);
    }
}