cmc-sales/go/internal/cmc/pdf/counter.go

48 lines
1.3 KiB
Go

package pdf
import (
	"bytes"
	"fmt"
	"os"
	"regexp"
	"strconv"
)
// Patterns used by CountPDFPages. They are compiled once at package
// initialization rather than on every call.
var (
	// pagesNodeRe matches the type entry of a page tree node (/Type /Pages).
	// Whitespace between the two names is optional, and the trailing
	// delimiter (or end of input) keeps longer names such as /PagesX from
	// matching.
	pagesNodeRe = regexp.MustCompile(`/Type\s*/Pages(?:[\s/()<>\[\]{}%]|\z)`)

	// pageLeafRe matches the type entry of a single page object
	// (/Type /Page). The trailing delimiter prevents it from also matching
	// /Type /Pages.
	pageLeafRe = regexp.MustCompile(`/Type\s*/Page(?:[\s/()<>\[\]{}%]|\z)`)

	// pagesCountRe captures the integer value of a /Count entry.
	pagesCountRe = regexp.MustCompile(`/Count\s+(\d+)`)

	// endObjMarker is the keyword that closes an indirect object; it is used
	// to bound the search for /Count to the object holding a /Pages node.
	endObjMarker = []byte("endobj")
)

// CountPDFPages counts the number of pages in the PDF file at filePath.
//
// The whole file is read into memory and scanned textually; it is not parsed
// as a PDF. Two strategies are tried in order:
//
//  1. For every page tree node (/Type /Pages), the /Count entry found in the
//     same indirect object is read, regardless of key order or spacing. The
//     largest positive value is returned, since the root node's count covers
//     the whole document while intermediate nodes only cover a subtree.
//  2. If no usable /Count is found, the number of /Type /Page entries is
//     returned.
//
// Only text visible in the raw bytes is considered, so page dictionaries
// stored inside compressed streams are not seen.
//
// It returns an error if the file cannot be stat'ed or read, or if neither
// strategy yields a positive page count. The returned count is 0 whenever the
// error is non-nil.
func CountPDFPages(filePath string) (int, error) {
	// Stat first so a missing or inaccessible path keeps its dedicated
	// "file not found" error message.
	if _, err := os.Stat(filePath); err != nil {
		return 0, fmt.Errorf("file not found: %w", err)
	}

	data, err := os.ReadFile(filePath)
	if err != nil {
		return 0, fmt.Errorf("failed to read file: %w", err)
	}

	// Preferred source: the /Count recorded on the page tree.
	if n := largestPagesCount(data); n > 0 {
		return n, nil
	}

	// Fallback: count individual page objects.
	if pages := pageLeafRe.FindAllIndex(data, -1); len(pages) > 0 {
		return len(pages), nil
	}

	return 0, fmt.Errorf("unable to determine page count")
}

// largestPagesCount returns the largest /Count value attached to any
// /Type /Pages node in data, or 0 if none is found.
func largestPagesCount(data []byte) int {
	best := 0
	for _, loc := range pagesNodeRe.FindAllIndex(data, -1) {
		// Limit the /Count search to the indirect object that contains this
		// node: from the end of the previous "endobj" (or start of data) up
		// to the next "endobj" (or end of data). This makes the lookup
		// independent of the order of keys inside the dictionary.
		start := 0
		if i := bytes.LastIndex(data[:loc[0]], endObjMarker); i >= 0 {
			start = i + len(endObjMarker)
		}
		end := len(data)
		if i := bytes.Index(data[loc[0]:], endObjMarker); i >= 0 {
			end = loc[0] + i
		}

		m := pagesCountRe.FindSubmatch(data[start:end])
		if m == nil {
			continue
		}
		// The capture is digits only, so Atoi can fail only on overflow;
		// such a value is ignored.
		if n, err := strconv.Atoi(string(m[1])); err == nil && n > best {
			best = n
		}
	}
	return best
}