extractLines($filePath);

        // Try to identify a tabular structure
        $parsed = $this->parseTableStructure($lines, $headerRow, $dataStartRow);

        return $parsed;
    }

    /**
     * Get headers from PDF.
     *
     * @param string $filePath Path of the PDF file.
     * @param array  $options  Supports 'header_row': index into the non-empty
     *                         extracted lines (defaults to 0).
     *
     * @return array Cells of the header line, or an empty array when that line
     *               does not exist.
     */
    public function getHeaders(string $filePath, array $options = []): array
    {
        $headerRow = $options['header_row'] ?? 0;
        $lines = $this->extractLines($filePath);

        return isset($lines[$headerRow])
            ? $this->parseLine($lines[$headerRow])
            : [];
    }

    /**
     * Get preview data.
     *
     * @param string $filePath Path of the PDF file.
     * @param int    $rows     Maximum number of preview rows to return.
     * @param array  $options  Accepted for interface compatibility; not read here.
     *
     * @return array Map with the keys 'preview' (list of rows, each holding
     *               'row_index', 'data' and 'raw'), 'total_rows' (number of
     *               non-empty extracted lines), 'columns_count' (cell count of
     *               the first preview row, 0 when there is none) and
     *               'raw_text_available' (always true).
     */
    public function getPreview(string $filePath, int $rows = 10, array $options = []): array
    {
        $lines = $this->extractLines($filePath);
        $preview = [];

        foreach ($lines as $index => $line) {
            if (count($preview) >= $rows) {
                break;
            }

            $parsed = $this->parseLine($line);
            if (empty($parsed)) {
                continue;
            }

            $preview[] = [
                'row_index' => $index,
                'data' => $parsed,
                'raw' => $line,
            ];
        }

        return [
            'preview' => $preview,
            'total_rows' => count($lines),
            'columns_count' => !empty($preview) ? count($preview[0]['data']) : 0,
            'raw_text_available' => true,
        ];
    }

    /**
     * Extract lines from PDF.
     *
     * Uses the PdfParserLib class when it is loadable; otherwise, or when that
     * library throws an exception, delegates to the pdftotext command.
     *
     * @param string $filePath Path of the PDF file.
     *
     * @return array Re-indexed list of the lines that are not blank.
     *
     * @throws \RuntimeException When the pdftotext fallback is used and fails.
     */
    protected function extractLines(string $filePath): array
    {
        // Check whether the library is available; if not, try pdftotext (poppler-utils)
        if (!class_exists(PdfParserLib::class)) {
            return $this->extractWithPdftotext($filePath);
        }

        try {
            $parser = new PdfParserLib();
            $text = $parser->parseFile($filePath)->getText();
        } catch (\Exception $e) {
            return $this->extractWithPdftotext($filePath);
        }

        // Split on any newline convention so CRLF / CR text does not leave a
        // stray "\r" at the end of every line.
        $lines = preg_split('/\r\n|\r|\n/', $text);
        if ($lines === false) {
            $lines = explode("\n", $text);
        }

        // Drop blank lines and re-index
        return array_values(array_filter($lines, static fn($line) => trim($line) !== ''));
    }

    /**
     * Extract using pdftotext command.
     *
     * Runs pdftotext with -layout first and retries without it when that run
     * fails or prints nothing.
     *
     * @param string $filePath Path of the PDF file.
     *
     * @return array Re-indexed list of the output lines that are not blank.
     *
     * @throws \RuntimeException When the last pdftotext run exits with a
     *                           non-zero status.
     */
    protected function extractWithPdftotext(string $filePath): array
    {
        $output = [];
        $returnVar = 0;
        $target = escapeshellarg($filePath);

        // Try with the layout preserved
        exec("pdftotext -layout " . $target . " - 2>/dev/null", $output, $returnVar);

        if ($returnVar !== 0 || empty($output)) {
            // exec() appends to a non-empty array, so discard whatever the
            // failed attempt printed before retrying without layout.
            $output = [];
            exec("pdftotext " . $target . " - 2>/dev/null", $output, $returnVar);
        }

        if ($returnVar !== 0) {
            throw new \RuntimeException("Could not extract text from PDF. Please install poppler-utils or smalot/pdfparser.");
        }

        // Drop blank lines and re-index
        return array_values(array_filter($output, static fn($line) => trim($line) !== ''));
    }

    /**
     * Parse a single line into columns.
     *
     * Splits on runs of two or more whitespace characters; if that yields a
     * single cell, splits on tab characters; otherwise the trimmed line is
     * returned as the only cell.
     *
     * @param string $line Raw text line.
     *
     * @return array List of trimmed cells (always at least one element).
     */
    protected function parseLine(string $line): array
    {
        // Try splitting on multiple spaces. preg_split() returns false on a
        // PCRE failure; in that case fall through to the next strategy.
        $parts = preg_split('/\s{2,}/', trim($line));
        if ($parts !== false && count($parts) > 1) {
            return array_map('trim', $parts);
        }

        // If that did not work, try tabs
        $parts = explode("\t", $line);
        if (count($parts) > 1) {
            return array_map('trim', $parts);
        }

        // Return the line as a single element
        return [trim($line)];
    }

    /**
     * Parse table structure from lines.
     *
     * The line at $headerRow becomes the header and is never added to the data,
     * even when $dataStartRow is not greater than $headerRow. Data rows seen
     * after the header is known are padded with '' or truncated to the header
     * width.
     *
     * @param array $lines        Lines keyed by row index.
     * @param int   $headerRow    Index of the header line.
     * @param int   $dataStartRow First index treated as data.
     *
     * @return array Map with the keys 'headers', 'data' and 'total_rows'.
     */
    protected function parseTableStructure(array $lines, int $headerRow, int $dataStartRow): array
    {
        $headers = [];
        $data = [];

        foreach ($lines as $index => $line) {
            $parsed = $this->parseLine($line);
            if (empty($parsed)) {
                continue;
            }

            if ($index === $headerRow) {
                $headers = $parsed;
                continue;
            }

            if ($index < $dataStartRow) {
                continue;
            }

            // Adjust the number of columns to match the headers
            if (!empty($headers)) {
                $width = count($headers);
                $parsed = array_slice(array_pad($parsed, $width, ''), 0, $width);
            }

            $data[] = $parsed;
        }

        return [
            'headers' => $headers,
            'data' => $data,
            'total_rows' => count($data),
        ];
    }

    /**
     * Check if parser supports the extension.
     *
     * @param string $extension File extension; compared case-insensitively.
     *
     * @return bool True when the lower-cased extension is listed in
     *              self::$supportedExtensions.
     */
    public static function supports(string $extension): bool
    {
        return in_array(strtolower($extension), self::$supportedExtensions, true);
    }
}