457 lines
13 KiB
Go
457 lines
13 KiB
Go
package services
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"strconv"
|
|
"strings"
|
|
"testing"
|
|
)
|
|
|
|
// ==================== Section 1.1: Valid PDF Files ====================
|
|
|
|
// Test 1.1.1: Single-page PDF extraction
|
|
func TestExtractPDFText_SinglePage(t *testing.T) {
|
|
content := "Single Page Resume\nSoftware Engineer with 5 years of experience."
|
|
testPDF := createSimplePDF(content)
|
|
reader := bytes.NewReader(testPDF)
|
|
|
|
text, err := extractPDFText(reader)
|
|
if err != nil {
|
|
t.Errorf("Test 1.1.1 FAILED: Unexpected error: %v", err)
|
|
return
|
|
}
|
|
|
|
if text == "" {
|
|
t.Error("Test 1.1.1 FAILED: Empty text extracted")
|
|
return
|
|
}
|
|
|
|
if !strings.Contains(text, "Single Page Resume") || !strings.Contains(text, "Software Engineer") {
|
|
t.Errorf("Test 1.1.1 FAILED: Expected key content not found. Extracted text: %q", text)
|
|
return
|
|
}
|
|
|
|
t.Log("Test 1.1.1 PASSED: Single-page PDF extracted successfully")
|
|
}
|
|
|
|
// Test 1.1.2: Multi-page PDF extraction
|
|
func TestExtractPDFText_MultiPage(t *testing.T) {
|
|
testPDF := createMultiPagePDF(3, "Page content for resume")
|
|
reader := bytes.NewReader(testPDF)
|
|
|
|
text, err := extractPDFText(reader)
|
|
if err != nil {
|
|
t.Errorf("Test 1.1.2 FAILED: Unexpected error: %v", err)
|
|
return
|
|
}
|
|
|
|
if text == "" {
|
|
t.Error("Test 1.1.2 FAILED: Empty text extracted")
|
|
return
|
|
}
|
|
|
|
page1 := "Page content for resume page 1"
|
|
page2 := "Page content for resume page 2"
|
|
page3 := "Page content for resume page 3"
|
|
|
|
if !strings.Contains(text, page1) || !strings.Contains(text, page2) || !strings.Contains(text, page3) {
|
|
t.Errorf("Test 1.1.2 FAILED: Missing expected page content. Extracted text: %q", text)
|
|
return
|
|
}
|
|
|
|
if !(strings.Index(text, page1) < strings.Index(text, page2) && strings.Index(text, page2) < strings.Index(text, page3)) {
|
|
t.Errorf("Test 1.1.2 FAILED: Page order not preserved. Extracted text: %q", text)
|
|
return
|
|
}
|
|
|
|
t.Log("Test 1.1.2 PASSED: Multi-page PDF extracted successfully")
|
|
}
|
|
|
|
// Test 1.1.3: PDF with special characters
|
|
func TestExtractPDFText_SpecialCharacters(t *testing.T) {
|
|
specialChars := "Resume with special chars: é, ñ, ü, ®, ©, € and symbols: @#$%^&*()"
|
|
testPDF := createSimplePDF(specialChars)
|
|
reader := bytes.NewReader(testPDF)
|
|
|
|
text, err := extractPDFText(reader)
|
|
if err != nil {
|
|
t.Errorf("Test 1.1.3 FAILED: Unexpected error: %v", err)
|
|
return
|
|
}
|
|
|
|
if text == "" {
|
|
t.Error("Test 1.1.3 FAILED: Empty text extracted")
|
|
return
|
|
}
|
|
|
|
if !strings.Contains(text, "special chars") || !strings.Contains(text, "@#$%^&*") {
|
|
t.Errorf("Test 1.1.3 FAILED: Expected special-character content not found. Extracted text: %q", text)
|
|
return
|
|
}
|
|
|
|
t.Log("Test 1.1.3 PASSED: PDF with special characters extracted successfully")
|
|
}
|
|
|
|
// Test 1.1.4: PDF with tables and formatting
|
|
func TestExtractPDFText_FormattedContent(t *testing.T) {
|
|
content := "Work Experience\n2020-2024 Senior Engineer at TechCorp\nResponsibilities:\n- Led team\n- Delivered projects\n- Mentored juniors"
|
|
testPDF := createSimplePDF(content)
|
|
reader := bytes.NewReader(testPDF)
|
|
|
|
text, err := extractPDFText(reader)
|
|
if err != nil {
|
|
t.Errorf("Test 1.1.4 FAILED: Unexpected error: %v", err)
|
|
return
|
|
}
|
|
|
|
if text == "" {
|
|
t.Error("Test 1.1.4 FAILED: Empty text extracted")
|
|
return
|
|
}
|
|
|
|
if !strings.Contains(text, "Work Experience") || !strings.Contains(text, "Responsibilities") || !strings.Contains(text, "Mentored juniors") {
|
|
t.Errorf("Test 1.1.4 FAILED: Expected formatted content missing. Extracted text: %q", text)
|
|
return
|
|
}
|
|
|
|
t.Log("Test 1.1.4 PASSED: Formatted content extracted successfully")
|
|
}
|
|
|
|
// ==================== Section 1.2: Invalid PDF Files ====================
|
|
|
|
// Test 1.2.1: Non-PDF file (DOCX)
|
|
func TestExtractPDFText_NonPDFDOCX(t *testing.T) {
|
|
// Create fake DOCX data (just random bytes)
|
|
fakeDOCX := []byte("PK\x03\x04" + "not a real docx file")
|
|
reader := bytes.NewReader(fakeDOCX)
|
|
|
|
_, err := extractPDFText(reader)
|
|
if err == nil {
|
|
t.Error("Test 1.2.1 FAILED: Expected error for non-PDF file, got nil")
|
|
return
|
|
}
|
|
|
|
if !strings.Contains(err.Error(), "not a PDF file") {
|
|
t.Errorf("Test 1.2.1 FAILED: Expected non-PDF error, got: %v", err)
|
|
return
|
|
}
|
|
|
|
t.Logf("Test 1.2.1 PASSED: Non-PDF DOCX rejected with error: %v", err)
|
|
}
|
|
|
|
// Test 1.2.2: Non-PDF file (JPEG)
|
|
func TestExtractPDFText_NonPDFJPEG(t *testing.T) {
|
|
// Create fake JPEG data
|
|
fakeJPEG := []byte("\xff\xd8\xff\xe0" + "not a real jpeg")
|
|
reader := bytes.NewReader(fakeJPEG)
|
|
|
|
_, err := extractPDFText(reader)
|
|
if err == nil {
|
|
t.Error("Test 1.2.2 FAILED: Expected error for JPEG file, got nil")
|
|
return
|
|
}
|
|
|
|
if !strings.Contains(err.Error(), "not a PDF file") {
|
|
t.Errorf("Test 1.2.2 FAILED: Expected non-PDF error, got: %v", err)
|
|
return
|
|
}
|
|
|
|
t.Logf("Test 1.2.2 PASSED: Non-PDF JPEG rejected with error: %v", err)
|
|
}
|
|
|
|
// Test 1.2.3: Corrupted PDF
|
|
func TestExtractPDFText_CorruptedPDF(t *testing.T) {
|
|
// Start with valid PDF header but corrupt the content
|
|
corruptedPDF := []byte("%PDF-1.4\n" + "corrupted binary data \x00\x01\x02\x03")
|
|
reader := bytes.NewReader(corruptedPDF)
|
|
|
|
_, err := extractPDFText(reader)
|
|
if err == nil {
|
|
t.Error("Test 1.2.3 FAILED: Expected error for corrupted PDF, got nil")
|
|
return
|
|
}
|
|
|
|
if !strings.Contains(err.Error(), "not a PDF file") {
|
|
t.Errorf("Test 1.2.3 FAILED: Expected parse error, got: %v", err)
|
|
return
|
|
}
|
|
|
|
t.Logf("Test 1.2.3 PASSED: Corrupted PDF rejected with error: %v", err)
|
|
}
|
|
|
|
// Test 1.2.4: Empty PDF (0 bytes)
|
|
func TestExtractPDFText_EmptyPDF(t *testing.T) {
|
|
emptyData := []byte{}
|
|
reader := bytes.NewReader(emptyData)
|
|
|
|
_, err := extractPDFText(reader)
|
|
if err == nil {
|
|
t.Error("Test 1.2.4 FAILED: Expected error for empty PDF, got nil")
|
|
return
|
|
}
|
|
|
|
if !strings.Contains(err.Error(), "not a PDF file") {
|
|
t.Errorf("Test 1.2.4 FAILED: Expected parse error, got: %v", err)
|
|
return
|
|
}
|
|
|
|
t.Logf("Test 1.2.4 PASSED: Empty PDF rejected with error: %v", err)
|
|
}
|
|
|
|
// Test 1.2.5: PDF with no text (image-only)
|
|
func TestExtractPDFText_ImageOnlyPDF(t *testing.T) {
|
|
testPDF := createMinimalPDF()
|
|
reader := bytes.NewReader(testPDF)
|
|
|
|
text, err := extractPDFText(reader)
|
|
if err != nil {
|
|
t.Errorf("Test 1.2.5 FAILED: Expected no error for image-only/minimal PDF, got: %v", err)
|
|
return
|
|
}
|
|
|
|
if strings.TrimSpace(text) != "" {
|
|
t.Errorf("Test 1.2.5 FAILED: Expected empty/minimal text, got: %q", text)
|
|
return
|
|
}
|
|
|
|
t.Logf("Test 1.2.5 PASSED: Image-only PDF returned text: %q", text)
|
|
}
|
|
|
|
// Test 1.2.6: Password-protected PDF.
//
// Always skipped: the fixture helpers in this file cannot produce an
// encrypted document, so there is nothing to feed the extractor yet.
func TestExtractPDFText_PasswordProtectedPDF(t *testing.T) {
	const reason = "Test 1.2.6 SKIPPED: Password-protected PDF creation requires specialized library"
	t.Skip(reason)
}
|
|
|
|
// Test 1.2.7: Null/empty reader
|
|
func TestExtractPDFText_NullReader(t *testing.T) {
|
|
_, err := extractPDFText(bytes.NewReader([]byte{}))
|
|
if err == nil {
|
|
t.Error("Test 1.2.7 FAILED: Expected error for empty reader, got nil")
|
|
return
|
|
}
|
|
|
|
if !strings.Contains(err.Error(), "not a PDF file") {
|
|
t.Errorf("Test 1.2.7 FAILED: Expected parse error, got: %v", err)
|
|
return
|
|
}
|
|
|
|
t.Logf("Test 1.2.7 PASSED: Empty reader rejected with error: %v", err)
|
|
}
|
|
|
|
// ==================== Section 1.3: PDF Format Variations ====================
|
|
|
|
// Test 1.3.1: PDF version 1.4
|
|
func TestExtractPDFText_PDFVersion14(t *testing.T) {
|
|
testPDF := createPDFWithVersion("1.4", "Content for PDF 1.4")
|
|
reader := bytes.NewReader(testPDF)
|
|
|
|
text, err := extractPDFText(reader)
|
|
if err != nil {
|
|
t.Errorf("Test 1.3.1 FAILED: Unexpected error: %v", err)
|
|
return
|
|
}
|
|
|
|
if text == "" {
|
|
t.Error("Test 1.3.1 FAILED: Empty text extracted")
|
|
return
|
|
}
|
|
|
|
if !strings.Contains(text, "Content for PDF 1.4") {
|
|
t.Errorf("Test 1.3.1 FAILED: Expected version test content not found. Extracted text: %q", text)
|
|
return
|
|
}
|
|
|
|
t.Log("Test 1.3.1 PASSED: PDF 1.4 extracted successfully")
|
|
}
|
|
|
|
// Test 1.3.2: PDF version 1.7
|
|
func TestExtractPDFText_PDFVersion17(t *testing.T) {
|
|
testPDF := createPDFWithVersion("1.7", "Content for PDF 1.7")
|
|
reader := bytes.NewReader(testPDF)
|
|
|
|
text, err := extractPDFText(reader)
|
|
if err != nil {
|
|
t.Errorf("Test 1.3.2 FAILED: Unexpected error: %v", err)
|
|
return
|
|
}
|
|
|
|
if text == "" {
|
|
t.Error("Test 1.3.2 FAILED: Empty text extracted")
|
|
return
|
|
}
|
|
|
|
if !strings.Contains(text, "Content for PDF 1.7") {
|
|
t.Errorf("Test 1.3.2 FAILED: Expected version test content not found. Extracted text: %q", text)
|
|
return
|
|
}
|
|
|
|
t.Log("Test 1.3.2 PASSED: PDF 1.7 extracted successfully")
|
|
}
|
|
|
|
// Test 1.3.3: Very large PDF (100+ pages) - Benchmark
|
|
func TestExtractPDFText_LargePDF(t *testing.T) {
|
|
testPDF := createMultiPagePDF(100, "Resume content for performance testing")
|
|
reader := bytes.NewReader(testPDF)
|
|
|
|
text, err := extractPDFText(reader)
|
|
if err != nil {
|
|
t.Errorf("Test 1.3.3 FAILED: Unexpected error: %v", err)
|
|
return
|
|
}
|
|
|
|
if text == "" {
|
|
t.Error("Test 1.3.3 FAILED: Empty text extracted from large PDF")
|
|
return
|
|
}
|
|
|
|
firstPage := "Resume content for performance testing page 1"
|
|
lastPage := "Resume content for performance testing page 100"
|
|
if !strings.Contains(text, firstPage) || !strings.Contains(text, lastPage) {
|
|
t.Errorf("Test 1.3.3 FAILED: Missing first/last page content in large PDF extraction")
|
|
return
|
|
}
|
|
|
|
t.Logf("Test 1.3.3 PASSED: Large PDF (100 pages) extracted successfully. Text length: %d", len(text))
|
|
}
|
|
|
|
func TestExtractPDFText_10_2_3_PDFWith1000Pages(t *testing.T) {
|
|
testPDF := createMultiPagePDF(1000, "Boundary PDF content")
|
|
reader := bytes.NewReader(testPDF)
|
|
|
|
text, err := extractPDFText(reader)
|
|
if err != nil {
|
|
t.Fatalf("Test 10.2.3 FAILED: Unexpected error for 1000-page PDF: %v", err)
|
|
}
|
|
|
|
if !strings.Contains(text, "Boundary PDF content page 1") || !strings.Contains(text, "Boundary PDF content page 1000") {
|
|
t.Fatalf("Test 10.2.3 FAILED: Missing first/last page content in 1000-page extraction")
|
|
}
|
|
}
|
|
|
|
// ==================== Helper Functions ====================
|
|
|
|
// createSimplePDF creates a valid single-page PDF with extractable text.
|
|
func createSimplePDF(content string) []byte {
|
|
if strings.TrimSpace(content) == "" {
|
|
content = "Sample resume content"
|
|
}
|
|
|
|
return createPDF("1.4", []string{content})
|
|
}
|
|
|
|
// createMinimalPDF creates a valid PDF with no text stream.
|
|
func createMinimalPDF() []byte {
|
|
return createPDF("1.4", []string{""})
|
|
}
|
|
|
|
// createMultiPagePDF creates a valid multi-page PDF with extractable text.
|
|
func createMultiPagePDF(pages int, content string) []byte {
|
|
if pages < 1 {
|
|
pages = 1
|
|
}
|
|
if strings.TrimSpace(content) == "" {
|
|
content = "Sample resume content"
|
|
}
|
|
|
|
pageTexts := make([]string, pages)
|
|
for i := 0; i < pages; i++ {
|
|
pageTexts[i] = fmt.Sprintf("%s page %d", content, i+1)
|
|
}
|
|
|
|
return createPDF("1.4", pageTexts)
|
|
}
|
|
|
|
// createPDFWithVersion creates a PDF with specific version
|
|
func createPDFWithVersion(version string, content string) []byte {
|
|
if strings.TrimSpace(content) == "" {
|
|
content = "Sample resume content"
|
|
}
|
|
|
|
return createPDF(version, []string{content})
|
|
}
|
|
|
|
func createPDF(version string, pageTexts []string) []byte {
|
|
if strings.TrimSpace(version) == "" {
|
|
version = "1.4"
|
|
}
|
|
if len(pageTexts) == 0 {
|
|
pageTexts = []string{"Sample resume content"}
|
|
}
|
|
|
|
buf := bytes.NewBuffer(nil)
|
|
buf.WriteString("%PDF-")
|
|
buf.WriteString(version)
|
|
buf.WriteString("\n")
|
|
|
|
offsets := []int{0}
|
|
writeObj := func(objNum int, body string) {
|
|
offsets = append(offsets, buf.Len())
|
|
buf.WriteString(strconv.Itoa(objNum))
|
|
buf.WriteString(" 0 obj\n")
|
|
buf.WriteString(body)
|
|
buf.WriteString("\nendobj\n")
|
|
}
|
|
|
|
pageCount := len(pageTexts)
|
|
fontObjNum := 3 + (pageCount * 2)
|
|
|
|
writeObj(1, "<</Type /Catalog /Pages 2 0 R>>")
|
|
|
|
var kids strings.Builder
|
|
kids.WriteString("[")
|
|
for i := range pageCount {
|
|
if i > 0 {
|
|
kids.WriteString(" ")
|
|
}
|
|
pageObjNum := 3 + (i * 2)
|
|
kids.WriteString(strconv.Itoa(pageObjNum))
|
|
kids.WriteString(" 0 R")
|
|
}
|
|
kids.WriteString("]")
|
|
writeObj(2, fmt.Sprintf("<</Type /Pages /Kids %s /Count %d>>", kids.String(), pageCount))
|
|
|
|
for i, pageText := range pageTexts {
|
|
pageObjNum := 3 + (i * 2)
|
|
contentObjNum := pageObjNum + 1
|
|
|
|
writeObj(pageObjNum,
|
|
fmt.Sprintf("<</Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources <</Font <</F1 %d 0 R>>>> /Contents %d 0 R>>", fontObjNum, contentObjNum),
|
|
)
|
|
|
|
escaped := escapePDFText(pageText)
|
|
stream := fmt.Sprintf("BT\n/F1 12 Tf\n72 720 Td\n(%s) Tj\nET\n", escaped)
|
|
writeObj(contentObjNum, fmt.Sprintf("<</Length %d>>\nstream\n%sendstream", len(stream), stream))
|
|
}
|
|
|
|
writeObj(fontObjNum, "<</Type /Font /Subtype /Type1 /BaseFont /Helvetica>>")
|
|
|
|
xrefOffset := buf.Len()
|
|
buf.WriteString("xref\n")
|
|
fmt.Fprintf(buf, "0 %d\n", len(offsets))
|
|
buf.WriteString("0000000000 65535 f \n")
|
|
for i := 1; i < len(offsets); i++ {
|
|
fmt.Fprintf(buf, "%010d 00000 n \n", offsets[i])
|
|
}
|
|
|
|
buf.WriteString("trailer\n")
|
|
fmt.Fprintf(buf, "<</Size %d /Root 1 0 R>>\n", len(offsets))
|
|
buf.WriteString("startxref\n")
|
|
fmt.Fprintf(buf, "%d\n", xrefOffset)
|
|
buf.WriteString("%%EOF")
|
|
|
|
return buf.Bytes()
|
|
}
|
|
|
|
// escapePDFText prepares s for use inside a parenthesised PDF string literal.
// Backslashes and both parentheses are prefixed with a backslash, and every
// newline or carriage return becomes a single space (so "\r\n" turns into
// two spaces).
func escapePDFText(s string) string {
	// A single pass is enough: no replacement output is rescanned, which
	// matches escaping the backslash first and the parentheses afterwards.
	escaper := strings.NewReplacer(
		"\\", "\\\\",
		"(", "\\(",
		")", "\\)",
		"\n", " ",
		"\r", " ",
	)
	return escaper.Replace(s)
}
|