195 lines
5.5 KiB
PHP
195 lines
5.5 KiB
PHP
<?php
|
|
|
|
namespace App\Services\Import;
|
|
|
|
use Smalot\PdfParser\Parser as PdfParserLib;
|
|
|
|
/**
 * Import parser that extracts the text of a PDF file and interprets every
 * non-blank line as one table row.
 *
 * Text is obtained through smalot/pdfparser when that class is available and
 * through the `pdftotext` command-line tool otherwise (or when the library
 * throws). Columns are detected heuristically: runs of two or more whitespace
 * characters first, tab characters second.
 */
class PdfParser implements FileParserInterface
{
    /** @var string[] File extensions (lowercase, without the dot) this parser accepts. */
    protected static array $supportedExtensions = ['pdf'];

    /**
     * Parse a PDF file: extract its text and try to identify a table in it.
     *
     * @param string $filePath Path of the PDF file to read.
     * @param array  $options  Recognised keys: 'header_row' (default 0) and
     *                         'data_start_row' (default 1); both are indexes
     *                         into the list of non-blank lines.
     *
     * @return array Array with the keys 'headers', 'data' and 'total_rows'.
     *
     * @throws \RuntimeException When no text could be extracted from the file.
     */
    public function parse(string $filePath, array $options = []): array
    {
        $headerRow = $options['header_row'] ?? 0;
        $dataStartRow = $options['data_start_row'] ?? 1;

        // Extract the text of the PDF as a list of non-blank lines.
        $lines = $this->extractLines($filePath);

        // Try to identify a tabular structure in those lines.
        return $this->parseTableStructure($lines, $headerRow, $dataStartRow);
    }

    /**
     * Get the header cells of the PDF.
     *
     * @param string $filePath Path of the PDF file to read.
     * @param array  $options  Recognised key: 'header_row' (default 0).
     *
     * @return string[] Cells of the header line, or an empty array when the
     *                  requested line does not exist.
     *
     * @throws \RuntimeException When no text could be extracted from the file.
     */
    public function getHeaders(string $filePath, array $options = []): array
    {
        $headerRow = $options['header_row'] ?? 0;
        $lines = $this->extractLines($filePath);

        if (!isset($lines[$headerRow])) {
            return [];
        }

        return $this->parseLine($lines[$headerRow]);
    }

    /**
     * Get preview data for the first lines of the PDF.
     *
     * @param string $filePath Path of the PDF file to read.
     * @param int    $rows     Maximum number of preview rows to return.
     * @param array  $options  Accepted for interface compatibility; not read here.
     *
     * @return array Array with the keys 'preview' (list of rows holding
     *               'row_index', 'data' and 'raw'), 'total_rows',
     *               'columns_count' (cell count of the first preview row, 0
     *               when there is none) and 'raw_text_available'.
     *
     * @throws \RuntimeException When no text could be extracted from the file.
     */
    public function getPreview(string $filePath, int $rows = 10, array $options = []): array
    {
        $lines = $this->extractLines($filePath);

        $preview = [];
        $count = 0;

        foreach ($lines as $index => $line) {
            if ($count >= $rows) {
                break;
            }

            $parsed = $this->parseLine($line);
            if (!empty($parsed)) {
                $preview[] = [
                    'row_index' => $index,
                    'data' => $parsed,
                    'raw' => $line,
                ];
                $count++;
            }
        }

        return [
            'preview' => $preview,
            'total_rows' => count($lines),
            'columns_count' => !empty($preview) ? count($preview[0]['data']) : 0,
            'raw_text_available' => true,
        ];
    }

    /**
     * Extract the non-blank text lines of a PDF.
     *
     * @param string $filePath Path of the PDF file to read.
     *
     * @return string[] Zero-indexed list of lines that are not blank.
     *
     * @throws \RuntimeException When the pdftotext fallback is used and fails.
     */
    protected function extractLines(string $filePath): array
    {
        // When the library is not installed, go straight to pdftotext (poppler-utils).
        if (!class_exists(PdfParserLib::class)) {
            return $this->extractWithPdftotext($filePath);
        }

        try {
            $parser = new PdfParserLib();
            $pdf = $parser->parseFile($filePath);
            $text = $pdf->getText();

            // Split into lines and drop the blank ones.
            $lines = array_filter(
                explode("\n", $text),
                fn($line) => trim($line) !== ''
            );

            // array_filter() preserves keys; reindex so row numbers are contiguous.
            return array_values($lines);
        } catch (\Exception $e) {
            // The library could not handle this file; give pdftotext a chance.
            return $this->extractWithPdftotext($filePath);
        }
    }

    /**
     * Extract the non-blank text lines of a PDF using the `pdftotext` command.
     *
     * The command is first run with `-layout`; when that fails or prints
     * nothing, it is run a second time without the flag.
     *
     * @param string $filePath Path of the PDF file to read.
     *
     * @return string[] Zero-indexed list of lines that are not blank.
     *
     * @throws \RuntimeException When exec() is unavailable or the command
     *                           exits with a non-zero status.
     */
    protected function extractWithPdftotext(string $filePath): array
    {
        $failureMessage = "Could not extract text from PDF. Please install poppler-utils or smalot/pdfparser.";

        // A disabled exec() is undefined on PHP 8 and calling it would raise an
        // \Error; report it through the same exception as any other failure.
        if (!function_exists('exec')) {
            throw new \RuntimeException($failureMessage);
        }

        $target = escapeshellarg($filePath);
        $output = [];
        $returnVar = 0;

        // First attempt: keep the page layout.
        exec("pdftotext -layout " . $target . " - 2>/dev/null", $output, $returnVar);

        if ($returnVar !== 0 || empty($output)) {
            // exec() appends to a non-empty array, so discard whatever the
            // failed attempt printed before retrying without -layout.
            $output = [];
            exec("pdftotext " . $target . " - 2>/dev/null", $output, $returnVar);
        }

        if ($returnVar !== 0) {
            throw new \RuntimeException($failureMessage);
        }

        // Drop blank lines and reindex.
        return array_values(array_filter($output, fn($line) => trim($line) !== ''));
    }

    /**
     * Parse a single line into columns.
     *
     * @param string $line One line of extracted text.
     *
     * @return string[] Trimmed cells; a single-element array holding the
     *                  trimmed line when no separator is found.
     */
    protected function parseLine(string $line): array
    {
        $trimmed = trim($line);

        // First try to split on runs of two or more whitespace characters.
        $parts = preg_split('/\s{2,}/', $trimmed);

        // preg_split() returns false on a PCRE failure; treat that as "no split".
        if ($parts !== false && count($parts) > 1) {
            return array_map('trim', $parts);
        }

        // If that did not work, try tab characters.
        $parts = explode("\t", $line);
        if (count($parts) > 1) {
            return array_map('trim', $parts);
        }

        // Return the whole line as a single cell.
        return [$trimmed];
    }

    /**
     * Build the header/data structure from the extracted lines.
     *
     * Rows processed after the header has been found are padded with empty
     * strings or truncated so they have exactly as many cells as the header.
     *
     * @param string[] $lines        Lines as returned by extractLines().
     * @param int      $headerRow    Index of the line holding the headers.
     * @param int      $dataStartRow Index of the first line treated as data.
     *
     * @return array Array with the keys 'headers', 'data' and 'total_rows'
     *               (the number of data rows).
     */
    protected function parseTableStructure(array $lines, int $headerRow, int $dataStartRow): array
    {
        $headers = [];
        $data = [];

        foreach ($lines as $index => $line) {
            $parsed = $this->parseLine($line);

            if ($index === $headerRow && !empty($parsed)) {
                $headers = $parsed;
                continue;
            }

            if ($index >= $dataStartRow && !empty($parsed)) {
                // Make the number of columns match the headers.
                if (!empty($headers)) {
                    $width = count($headers);
                    $parsed = array_slice(array_pad($parsed, $width, ''), 0, $width);
                }

                $data[] = $parsed;
            }
        }

        return [
            'headers' => $headers,
            'data' => $data,
            'total_rows' => count($data),
        ];
    }

    /**
     * Check if the parser supports the given extension.
     *
     * @param string $extension File extension without the dot; case-insensitive.
     *
     * @return bool True when the extension is listed in $supportedExtensions.
     */
    public static function supports(string $extension): bool
    {
        return in_array(strtolower($extension), self::$supportedExtensions, true);
    }
}
|