package services import ( "bytes" "fmt" "strconv" "strings" "testing" ) // ==================== Section 1.1: Valid PDF Files ==================== // Test 1.1.1: Single-page PDF extraction func TestExtractPDFText_SinglePage(t *testing.T) { content := "Single Page Resume\nSoftware Engineer with 5 years of experience." testPDF := createSimplePDF(content) reader := bytes.NewReader(testPDF) text, err := extractPDFText(reader) if err != nil { t.Errorf("Test 1.1.1 FAILED: Unexpected error: %v", err) return } if text == "" { t.Error("Test 1.1.1 FAILED: Empty text extracted") return } if !strings.Contains(text, "Single Page Resume") || !strings.Contains(text, "Software Engineer") { t.Errorf("Test 1.1.1 FAILED: Expected key content not found. Extracted text: %q", text) return } t.Log("Test 1.1.1 PASSED: Single-page PDF extracted successfully") } // Test 1.1.2: Multi-page PDF extraction func TestExtractPDFText_MultiPage(t *testing.T) { testPDF := createMultiPagePDF(3, "Page content for resume") reader := bytes.NewReader(testPDF) text, err := extractPDFText(reader) if err != nil { t.Errorf("Test 1.1.2 FAILED: Unexpected error: %v", err) return } if text == "" { t.Error("Test 1.1.2 FAILED: Empty text extracted") return } page1 := "Page content for resume page 1" page2 := "Page content for resume page 2" page3 := "Page content for resume page 3" if !strings.Contains(text, page1) || !strings.Contains(text, page2) || !strings.Contains(text, page3) { t.Errorf("Test 1.1.2 FAILED: Missing expected page content. Extracted text: %q", text) return } if !(strings.Index(text, page1) < strings.Index(text, page2) && strings.Index(text, page2) < strings.Index(text, page3)) { t.Errorf("Test 1.1.2 FAILED: Page order not preserved. 
Extracted text: %q", text) return } t.Log("Test 1.1.2 PASSED: Multi-page PDF extracted successfully") } // Test 1.1.3: PDF with special characters func TestExtractPDFText_SpecialCharacters(t *testing.T) { specialChars := "Resume with special chars: é, ñ, ü, ®, ©, € and symbols: @#$%^&*()" testPDF := createSimplePDF(specialChars) reader := bytes.NewReader(testPDF) text, err := extractPDFText(reader) if err != nil { t.Errorf("Test 1.1.3 FAILED: Unexpected error: %v", err) return } if text == "" { t.Error("Test 1.1.3 FAILED: Empty text extracted") return } if !strings.Contains(text, "special chars") || !strings.Contains(text, "@#$%^&*") { t.Errorf("Test 1.1.3 FAILED: Expected special-character content not found. Extracted text: %q", text) return } t.Log("Test 1.1.3 PASSED: PDF with special characters extracted successfully") } // Test 1.1.4: PDF with tables and formatting func TestExtractPDFText_FormattedContent(t *testing.T) { content := "Work Experience\n2020-2024 Senior Engineer at TechCorp\nResponsibilities:\n- Led team\n- Delivered projects\n- Mentored juniors" testPDF := createSimplePDF(content) reader := bytes.NewReader(testPDF) text, err := extractPDFText(reader) if err != nil { t.Errorf("Test 1.1.4 FAILED: Unexpected error: %v", err) return } if text == "" { t.Error("Test 1.1.4 FAILED: Empty text extracted") return } if !strings.Contains(text, "Work Experience") || !strings.Contains(text, "Responsibilities") || !strings.Contains(text, "Mentored juniors") { t.Errorf("Test 1.1.4 FAILED: Expected formatted content missing. 
Extracted text: %q", text) return } t.Log("Test 1.1.4 PASSED: Formatted content extracted successfully") } // ==================== Section 1.2: Invalid PDF Files ==================== // Test 1.2.1: Non-PDF file (DOCX) func TestExtractPDFText_NonPDFDOCX(t *testing.T) { // Create fake DOCX data (just random bytes) fakeDOCX := []byte("PK\x03\x04" + "not a real docx file") reader := bytes.NewReader(fakeDOCX) _, err := extractPDFText(reader) if err == nil { t.Error("Test 1.2.1 FAILED: Expected error for non-PDF file, got nil") return } if !strings.Contains(err.Error(), "not a PDF file") { t.Errorf("Test 1.2.1 FAILED: Expected non-PDF error, got: %v", err) return } t.Logf("Test 1.2.1 PASSED: Non-PDF DOCX rejected with error: %v", err) } // Test 1.2.2: Non-PDF file (JPEG) func TestExtractPDFText_NonPDFJPEG(t *testing.T) { // Create fake JPEG data fakeJPEG := []byte("\xff\xd8\xff\xe0" + "not a real jpeg") reader := bytes.NewReader(fakeJPEG) _, err := extractPDFText(reader) if err == nil { t.Error("Test 1.2.2 FAILED: Expected error for JPEG file, got nil") return } if !strings.Contains(err.Error(), "not a PDF file") { t.Errorf("Test 1.2.2 FAILED: Expected non-PDF error, got: %v", err) return } t.Logf("Test 1.2.2 PASSED: Non-PDF JPEG rejected with error: %v", err) } // Test 1.2.3: Corrupted PDF func TestExtractPDFText_CorruptedPDF(t *testing.T) { // Start with valid PDF header but corrupt the content corruptedPDF := []byte("%PDF-1.4\n" + "corrupted binary data \x00\x01\x02\x03") reader := bytes.NewReader(corruptedPDF) _, err := extractPDFText(reader) if err == nil { t.Error("Test 1.2.3 FAILED: Expected error for corrupted PDF, got nil") return } if !strings.Contains(err.Error(), "not a PDF file") { t.Errorf("Test 1.2.3 FAILED: Expected parse error, got: %v", err) return } t.Logf("Test 1.2.3 PASSED: Corrupted PDF rejected with error: %v", err) } // Test 1.2.4: Empty PDF (0 bytes) func TestExtractPDFText_EmptyPDF(t *testing.T) { emptyData := []byte{} reader := 
bytes.NewReader(emptyData) _, err := extractPDFText(reader) if err == nil { t.Error("Test 1.2.4 FAILED: Expected error for empty PDF, got nil") return } if !strings.Contains(err.Error(), "not a PDF file") { t.Errorf("Test 1.2.4 FAILED: Expected parse error, got: %v", err) return } t.Logf("Test 1.2.4 PASSED: Empty PDF rejected with error: %v", err) } // Test 1.2.5: PDF with no text (image-only) func TestExtractPDFText_ImageOnlyPDF(t *testing.T) { testPDF := createMinimalPDF() reader := bytes.NewReader(testPDF) text, err := extractPDFText(reader) if err != nil { t.Errorf("Test 1.2.5 FAILED: Expected no error for image-only/minimal PDF, got: %v", err) return } if strings.TrimSpace(text) != "" { t.Errorf("Test 1.2.5 FAILED: Expected empty/minimal text, got: %q", text) return } t.Logf("Test 1.2.5 PASSED: Image-only PDF returned text: %q", text) } // Test 1.2.6: Password-protected PDF func TestExtractPDFText_PasswordProtectedPDF(t *testing.T) { // Note: Creating a true encrypted PDF is complex // We'll test with a PDF-like structure that would fail parsing // For now, we'll skip this test or use a mock t.Skip("Test 1.2.6 SKIPPED: Password-protected PDF creation requires specialized library") } // Test 1.2.7: Null/empty reader func TestExtractPDFText_NullReader(t *testing.T) { _, err := extractPDFText(bytes.NewReader([]byte{})) if err == nil { t.Error("Test 1.2.7 FAILED: Expected error for empty reader, got nil") return } if !strings.Contains(err.Error(), "not a PDF file") { t.Errorf("Test 1.2.7 FAILED: Expected parse error, got: %v", err) return } t.Logf("Test 1.2.7 PASSED: Empty reader rejected with error: %v", err) } // ==================== Section 1.3: PDF Format Variations ==================== // Test 1.3.1: PDF version 1.4 func TestExtractPDFText_PDFVersion14(t *testing.T) { testPDF := createPDFWithVersion("1.4", "Content for PDF 1.4") reader := bytes.NewReader(testPDF) text, err := extractPDFText(reader) if err != nil { t.Errorf("Test 1.3.1 FAILED: Unexpected 
error: %v", err) return } if text == "" { t.Error("Test 1.3.1 FAILED: Empty text extracted") return } if !strings.Contains(text, "Content for PDF 1.4") { t.Errorf("Test 1.3.1 FAILED: Expected version test content not found. Extracted text: %q", text) return } t.Log("Test 1.3.1 PASSED: PDF 1.4 extracted successfully") } // Test 1.3.2: PDF version 1.7 func TestExtractPDFText_PDFVersion17(t *testing.T) { testPDF := createPDFWithVersion("1.7", "Content for PDF 1.7") reader := bytes.NewReader(testPDF) text, err := extractPDFText(reader) if err != nil { t.Errorf("Test 1.3.2 FAILED: Unexpected error: %v", err) return } if text == "" { t.Error("Test 1.3.2 FAILED: Empty text extracted") return } if !strings.Contains(text, "Content for PDF 1.7") { t.Errorf("Test 1.3.2 FAILED: Expected version test content not found. Extracted text: %q", text) return } t.Log("Test 1.3.2 PASSED: PDF 1.7 extracted successfully") } // Test 1.3.3: Very large PDF (100+ pages) - Benchmark func TestExtractPDFText_LargePDF(t *testing.T) { testPDF := createMultiPagePDF(100, "Resume content for performance testing") reader := bytes.NewReader(testPDF) text, err := extractPDFText(reader) if err != nil { t.Errorf("Test 1.3.3 FAILED: Unexpected error: %v", err) return } if text == "" { t.Error("Test 1.3.3 FAILED: Empty text extracted from large PDF") return } firstPage := "Resume content for performance testing page 1" lastPage := "Resume content for performance testing page 100" if !strings.Contains(text, firstPage) || !strings.Contains(text, lastPage) { t.Errorf("Test 1.3.3 FAILED: Missing first/last page content in large PDF extraction") return } t.Logf("Test 1.3.3 PASSED: Large PDF (100 pages) extracted successfully. 
Text length: %d", len(text)) } func TestExtractPDFText_10_2_3_PDFWith1000Pages(t *testing.T) { testPDF := createMultiPagePDF(1000, "Boundary PDF content") reader := bytes.NewReader(testPDF) text, err := extractPDFText(reader) if err != nil { t.Fatalf("Test 10.2.3 FAILED: Unexpected error for 1000-page PDF: %v", err) } if !strings.Contains(text, "Boundary PDF content page 1") || !strings.Contains(text, "Boundary PDF content page 1000") { t.Fatalf("Test 10.2.3 FAILED: Missing first/last page content in 1000-page extraction") } } // ==================== Helper Functions ==================== // createSimplePDF creates a valid single-page PDF with extractable text. func createSimplePDF(content string) []byte { if strings.TrimSpace(content) == "" { content = "Sample resume content" } return createPDF("1.4", []string{content}) } // createMinimalPDF creates a valid PDF with no text stream. func createMinimalPDF() []byte { return createPDF("1.4", []string{""}) } // createMultiPagePDF creates a valid multi-page PDF with extractable text. 
// Page i (counting from 1) carries the text "<content> page i". A page count
// below 1 is treated as 1, and content that is blank after trimming falls
// back to "Sample resume content". The document is always version 1.4.
func createMultiPagePDF(pages int, content string) []byte {
	if pages < 1 {
		pages = 1
	}
	if strings.TrimSpace(content) == "" {
		content = "Sample resume content"
	}
	pageTexts := make([]string, pages)
	for i := range pageTexts {
		pageTexts[i] = fmt.Sprintf("%s page %d", content, i+1)
	}
	return createPDF("1.4", pageTexts)
}

// createPDFWithVersion creates a single-page PDF whose header declares the
// given version (for example "1.4" or "1.7"). Blank content falls back to
// "Sample resume content"; a blank version is defaulted by createPDF.
func createPDFWithVersion(version string, content string) []byte {
	if strings.TrimSpace(content) == "" {
		content = "Sample resume content"
	}
	return createPDF(version, []string{content})
}

// createPDF serialises a minimal, uncompressed PDF with one page per entry of
// pageTexts and returns its bytes.
//
// Object layout (object numbers are also the write order, which is what keeps
// the cross-reference table in object-number order):
//
//	1            document catalog
//	2            page tree root
//	3+2i, 4+2i   page i and its content stream (i counts from 0)
//	3+2n         the shared Helvetica font, where n = len(pageTexts)
//
// A blank version defaults to "1.4" and an empty pageTexts slice is replaced
// by a single page reading "Sample resume content". Each page's text is
// escaped with escapePDFText and drawn with a single Tj operator.
func createPDF(version string, pageTexts []string) []byte {
	if strings.TrimSpace(version) == "" {
		version = "1.4"
	}
	if len(pageTexts) == 0 {
		pageTexts = []string{"Sample resume content"}
	}

	pageCount := len(pageTexts)
	fontObjNum := 3 + pageCount*2

	var buf bytes.Buffer
	buf.WriteString("%PDF-")
	buf.WriteString(version)
	buf.WriteString("\n")

	// offsets[k] is the byte offset of object k; slot 0 stands for the free
	// head entry of the xref table and is never read back.
	offsets := make([]int, 1, fontObjNum+1)
	writeObj := func(objNum int, body string) {
		offsets = append(offsets, buf.Len())
		buf.WriteString(strconv.Itoa(objNum))
		buf.WriteString(" 0 obj\n")
		buf.WriteString(body)
		buf.WriteString("\nendobj\n")
	}

	writeObj(1, "<</Type /Catalog /Pages 2 0 R>>")

	// /Kids lists an indirect reference to every page object.
	var kids strings.Builder
	kids.WriteString("[")
	for i := range pageTexts {
		if i > 0 {
			kids.WriteString(" ")
		}
		kids.WriteString(strconv.Itoa(3 + i*2))
		kids.WriteString(" 0 R")
	}
	kids.WriteString("]")
	writeObj(2, fmt.Sprintf("<</Type /Pages /Kids %s /Count %d>>", kids.String(), pageCount))

	for i, pageText := range pageTexts {
		pageObjNum := 3 + i*2
		contentObjNum := pageObjNum + 1
		writeObj(pageObjNum, fmt.Sprintf(
			"<</Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Resources <</Font <</F1 %d 0 R>>>> /Contents %d 0 R>>",
			fontObjNum, contentObjNum,
		))

		stream := fmt.Sprintf("BT\n/F1 12 Tf\n72 720 Td\n(%s) Tj\nET\n", escapePDFText(pageText))
		// /Length covers the whole stream, including its trailing newline;
		// "endstream" follows immediately after.
		writeObj(contentObjNum, fmt.Sprintf("<</Length %d>>\nstream\n%sendstream", len(stream), stream))
	}

	writeObj(fontObjNum, "<</Type /Font /Subtype /Type1 /BaseFont /Helvetica>>")

	// Cross-reference table: every entry is exactly 20 bytes long.
	xrefOffset := buf.Len()
	buf.WriteString("xref\n")
	fmt.Fprintf(&buf, "0 %d\n", len(offsets))
	buf.WriteString("0000000000 65535 f \n")
	for _, off := range offsets[1:] {
		fmt.Fprintf(&buf, "%010d 00000 n \n", off)
	}

	buf.WriteString("trailer\n")
	fmt.Fprintf(&buf, "<</Size %d /Root 1 0 R>>\n", len(offsets))
	buf.WriteString("startxref\n")
	fmt.Fprintf(&buf, "%d\n", xrefOffset)
	// WriteString does no formatting, so this emits the literal end-of-file
	// marker "%%EOF".
	buf.WriteString("%%EOF")
	return buf.Bytes()
}

// escapePDFText prepares s for use inside a PDF literal string "( ... )":
// backslashes and parentheses are backslash-escaped, and each newline or
// carriage return becomes a single space so the text stays on one line.
func escapePDFText(s string) string {
	// A single-pass replacer never re-escapes the backslashes it inserts.
	return strings.NewReplacer(
		"\\", "\\\\",
		"(", "\\(",
		")", "\\)",
		"\n", " ",
		"\r", " ",
	).Replace(s)
}