webmoney/backend/app/Services/Import/PdfParser.php

<?php

namespace App\Services\Import;

use Smalot\PdfParser\Parser as PdfParserLib;

class PdfParser implements FileParserInterface
{
    /**
     * Lower-case file extensions (without the leading dot) handled by this parser.
     *
     * @var string[]
     */
    protected static array $supportedExtensions = ['pdf'];

    /**
     * Parse a PDF file: extract its text and interpret the lines as a table.
     *
     * Recognised options:
     *  - header_row (default 0): index of the extracted line holding the column headers.
     *  - data_start_row (default 1): index of the first extracted line treated as data.
     *
     * Indexes refer to the list of non-blank lines extracted from the PDF.
     *
     * @param string $filePath Path to the PDF file.
     * @param array  $options  Parsing options (see above).
     *
     * @return array{headers: string[], data: array<int, string[]>, total_rows: int}
     *
     * @throws \RuntimeException When no text could be extracted from the PDF.
     */
    public function parse(string $filePath, array $options = []): array
    {
        $headerRow = $options['header_row'] ?? 0;
        $dataStartRow = $options['data_start_row'] ?? 1;

        return $this->parseTableStructure(
            $this->extractLines($filePath),
            $headerRow,
            $dataStartRow
        );
    }

    /**
     * Get the column headers from the PDF.
     *
     * @param string $filePath Path to the PDF file.
     * @param array  $options  Supports header_row (default 0).
     *
     * @return string[] Header cells, or an empty array when the header line does not exist.
     *
     * @throws \RuntimeException When no text could be extracted from the PDF.
     */
    public function getHeaders(string $filePath, array $options = []): array
    {
        $headerRow = $options['header_row'] ?? 0;
        $lines = $this->extractLines($filePath);

        return isset($lines[$headerRow])
            ? $this->parseLine($lines[$headerRow])
            : [];
    }

    /**
     * Build a preview made of the first parseable lines of the PDF.
     *
     * @param string $filePath Path to the PDF file.
     * @param int    $rows     Maximum number of preview rows to return.
     * @param array  $options  Currently not read by this implementation.
     *
     * @return array{preview: array<int, array{row_index: int, data: string[], raw: string}>, total_rows: int, columns_count: int, raw_text_available: bool}
     *
     * @throws \RuntimeException When no text could be extracted from the PDF.
     */
    public function getPreview(string $filePath, int $rows = 10, array $options = []): array
    {
        $lines = $this->extractLines($filePath);
        $preview = [];

        foreach ($lines as $index => $line) {
            if (count($preview) >= $rows) {
                break;
            }

            $cells = $this->parseLine($line);
            if (empty($cells)) {
                continue;
            }

            $preview[] = [
                'row_index' => $index,
                'data' => $cells,
                'raw' => $line,
            ];
        }

        return [
            'preview' => $preview,
            'total_rows' => count($lines),
            // The column count is taken from the first preview row only.
            'columns_count' => !empty($preview) ? count($preview[0]['data']) : 0,
            'raw_text_available' => true,
        ];
    }

    /**
     * Extract the non-blank text lines of a PDF.
     *
     * Uses smalot/pdfparser when the class is available and falls back to the
     * `pdftotext` command when the library is missing or throws.
     *
     * @param string $filePath Path to the PDF file.
     *
     * @return string[] Re-indexed list of non-blank lines.
     *
     * @throws \RuntimeException When the pdftotext fallback fails; if the library
     *                           failed first, its exception is attached as "previous".
     */
    protected function extractLines(string $filePath): array
    {
        // Library not installed: go straight to pdftotext (poppler-utils).
        if (!class_exists(PdfParserLib::class)) {
            return $this->extractWithPdftotext($filePath);
        }

        try {
            $text = (new PdfParserLib())->parseFile($filePath)->getText();
        } catch (\Exception $libraryError) {
            try {
                return $this->extractWithPdftotext($filePath);
            } catch (\RuntimeException $fallbackError) {
                // Same type and message as before, but keep the library error
                // in the chain so the real cause is not lost.
                throw new \RuntimeException(
                    $fallbackError->getMessage(),
                    (int) $fallbackError->getCode(),
                    $libraryError
                );
            }
        }

        // Split on LF or CRLF so Windows line endings do not leave a trailing "\r".
        $lines = preg_split('/\r?\n/', $text);
        if ($lines === false) {
            $lines = explode("\n", $text);
        }

        return array_values(
            array_filter($lines, static fn($line) => trim($line) !== '')
        );
    }

    /**
     * Extract the non-blank text lines of a PDF using the `pdftotext` command.
     *
     * Tries layout-preserving mode first, then plain mode when that fails or
     * produces no output.
     *
     * @param string $filePath Path to the PDF file.
     *
     * @return string[] Re-indexed list of non-blank lines.
     *
     * @throws \RuntimeException When pdftotext cannot be run or exits with a non-zero status.
     */
    protected function extractWithPdftotext(string $filePath): array
    {
        $failureMessage = "Could not extract text from PDF. Please install poppler-utils or smalot/pdfparser.";

        // exec() may be unavailable (e.g. disabled in php.ini); report that as
        // the same extraction failure instead of a fatal "undefined function".
        if (!function_exists('exec')) {
            throw new \RuntimeException($failureMessage);
        }

        $target = escapeshellarg($filePath);
        $output = [];
        $returnVar = 0;

        // First attempt: keep the page layout so columns stay aligned.
        exec("pdftotext -layout " . $target . " - 2>/dev/null", $output, $returnVar);

        if ($returnVar !== 0 || empty($output)) {
            // exec() appends to an existing array, so drop any partial output
            // from the failed attempt before retrying without -layout.
            $output = [];
            exec("pdftotext " . $target . " - 2>/dev/null", $output, $returnVar);
        }

        if ($returnVar !== 0) {
            throw new \RuntimeException($failureMessage);
        }

        return array_values(
            array_filter($output, static fn($line) => trim($line) !== '')
        );
    }

    /**
     * Split a single text line into column values.
     *
     * Columns are separated by runs of two or more whitespace characters;
     * when that yields a single column, tabs are tried as the separator.
     *
     * @param string $line Raw text line.
     *
     * @return string[] Trimmed cells; an empty array for a whitespace-only line.
     */
    protected function parseLine(string $line): array
    {
        $trimmed = trim($line);

        // A blank line has no cells; this makes the callers' empty() checks effective.
        if ($trimmed === '') {
            return [];
        }

        $parts = preg_split('/\s{2,}/', $trimmed);
        if ($parts !== false && count($parts) > 1) {
            return array_map('trim', $parts);
        }

        // Fall back to single tabs; the untrimmed line is used on purpose so a
        // leading tab still produces an empty first cell.
        $parts = explode("\t", $line);
        if (count($parts) > 1) {
            return array_map('trim', $parts);
        }

        // No separator found: the whole line is one cell.
        return [$trimmed];
    }

    /**
     * Turn extracted lines into a header row plus data rows.
     *
     * The header line is resolved before any data row is processed, so every
     * data row is padded/truncated to the header width even when
     * $dataStartRow is not greater than $headerRow. The header line itself is
     * never included in the data.
     *
     * @param string[] $lines        Extracted text lines.
     * @param int      $headerRow    Index of the header line.
     * @param int      $dataStartRow Index of the first line treated as data.
     *
     * @return array{headers: string[], data: array<int, string[]>, total_rows: int}
     */
    protected function parseTableStructure(array $lines, int $headerRow, int $dataStartRow): array
    {
        $headers = isset($lines[$headerRow])
            ? $this->parseLine($lines[$headerRow])
            : [];
        $columnCount = count($headers);
        $data = [];

        foreach ($lines as $index => $line) {
            if ($index === $headerRow || $index < $dataStartRow) {
                continue;
            }

            $row = $this->parseLine($line);
            if (empty($row)) {
                continue;
            }

            // Force every row to the header width: pad short rows with empty
            // strings and drop cells beyond the last header.
            if ($columnCount > 0) {
                $row = array_slice(array_pad($row, $columnCount, ''), 0, $columnCount);
            }

            $data[] = $row;
        }

        return [
            'headers' => $headers,
            'data' => $data,
            'total_rows' => count($data),
        ];
    }

    /**
     * Check whether this parser supports the given file extension.
     *
     * @param string $extension File extension without the dot; matched case-insensitively.
     *
     * @return bool True when the extension is in the supported list.
     */
    public static function supports(string $extension): bool
    {
        return in_array(strtolower($extension), self::$supportedExtensions, true);
    }
}