test: added live openai tests
This commit is contained in:
parent
9dafd16646
commit
457f1a9481
@ -1341,6 +1341,7 @@ _Document results here as tests are completed_
|
|||||||
| 2.3.x | ✅ PASSED | 2026-04-07 | OpenCode | 2-minute timeout behavior and network failure handling validated |
|
| 2.3.x | ✅ PASSED | 2026-04-07 | OpenCode | 2-minute timeout behavior and network failure handling validated |
|
||||||
| 2.4.x | ✅ PASSED | 2026-04-07 | OpenCode | JSON parsing success/failure/empty choices/missing fields covered |
|
| 2.4.x | ✅ PASSED | 2026-04-07 | OpenCode | JSON parsing success/failure/empty choices/missing fields covered |
|
||||||
| 2.5.x | ✅ PASSED | 2026-04-07 | OpenCode | Upstream 500/429 and malformed content error handling validated |
|
| 2.5.x | ✅ PASSED | 2026-04-07 | OpenCode | Upstream 500/429 and malformed content error handling validated |
|
||||||
|
| 2.x live consistency | ⏭️ READY (manual run) | 2026-04-07 | OpenCode | Added opt-in live test `TestCallLLM_Live_ScoreConsistencyPlusMinus10` gated by `RUN_LIVE_OPENAI_TESTS=1` |
|
||||||
| 3.1.x | ✅ PASSED | 2026-04-07 | OpenCode | Overall score range and consistency scenarios covered with deterministic mocks |
|
| 3.1.x | ✅ PASSED | 2026-04-07 | OpenCode | Overall score range and consistency scenarios covered with deterministic mocks |
|
||||||
| 3.2.x | ✅ PASSED | 2026-04-07 | OpenCode | Criteria population, score bounds, evidence/comments presence verified |
|
| 3.2.x | ✅ PASSED | 2026-04-07 | OpenCode | Criteria population, score bounds, evidence/comments presence verified |
|
||||||
| 3.3.x | ✅ PASSED | 2026-04-07 | OpenCode | Strengths/weaknesses/missing information structure checks validated |
|
| 3.3.x | ✅ PASSED | 2026-04-07 | OpenCode | Strengths/weaknesses/missing information structure checks validated |
|
||||||
@ -1397,7 +1398,7 @@ _Document any test failures here with details_
|
|||||||
|
|
||||||
**Next Steps:**
|
**Next Steps:**
|
||||||
1. Add CI step to run `go test ./...` before image build/push
|
1. Add CI step to run `go test ./...` before image build/push
|
||||||
2. Tighten manual validation notes for production-like OpenAI calls
|
2. Run opt-in live consistency test (`RUN_LIVE_OPENAI_TESTS=1`) and log observed score distribution
|
||||||
3. Backfill health-check endpoint or document non-AI endpoint strategy
|
3. Backfill health-check endpoint or document non-AI endpoint strategy
|
||||||
4. Keep the intentional 1.2.6 skip documented until encrypted-PDF fixtures are added
|
4. Keep the intentional 1.2.6 skip documented until encrypted-PDF fixtures are added
|
||||||
|
|
||||||
|
|||||||
77
internal/services/analyzer_live_test.go
Normal file
77
internal/services/analyzer_live_test.go
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
package services
|
||||||
|
|
||||||
|
import (
|
||||||
|
"math"
|
||||||
|
"os"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestCallLLM_Live_ScoreConsistencyPlusMinus10 validates SRD_NonFuncReq_0006 /
|
||||||
|
// SRD_QualAssurReq_0001 against the live OpenAI API.
|
||||||
|
//
|
||||||
|
// This test is opt-in to avoid API cost/flakiness in default runs:
|
||||||
|
// RUN_LIVE_OPENAI_TESTS=1 OPENAI_API_KEY=... go test ./internal/services -run TestCallLLM_Live_ScoreConsistencyPlusMinus10 -v
|
||||||
|
func TestCallLLM_Live_ScoreConsistencyPlusMinus10(t *testing.T) {
|
||||||
|
if os.Getenv("RUN_LIVE_OPENAI_TESTS") != "1" {
|
||||||
|
t.Skip("set RUN_LIVE_OPENAI_TESTS=1 to run live OpenAI consistency test")
|
||||||
|
}
|
||||||
|
if os.Getenv("OPENAI_API_KEY") == "" {
|
||||||
|
t.Skip("OPENAI_API_KEY is required for live OpenAI test")
|
||||||
|
}
|
||||||
|
|
||||||
|
resume := `Senior Software Engineer with 7 years of experience building Go backend services.
|
||||||
|
Led microservice migrations, improved API latency by 35%, and maintained CI/CD pipelines.
|
||||||
|
Experience includes Kubernetes, Docker, PostgreSQL, and cloud deployments on AWS.`
|
||||||
|
|
||||||
|
job := `We are hiring a Senior Go Backend Engineer with strong API design skills,
|
||||||
|
production Kubernetes experience, and ownership of scalable distributed systems.
|
||||||
|
Candidates should demonstrate measurable impact, collaboration, and code quality.`
|
||||||
|
|
||||||
|
const runs = 10
|
||||||
|
scores := make([]int, 0, runs)
|
||||||
|
|
||||||
|
for i := range runs {
|
||||||
|
result, err := callLLM(resume, job)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("run %d failed: %v", i+1, err)
|
||||||
|
}
|
||||||
|
if result.OverallScore < 0 || result.OverallScore > 100 {
|
||||||
|
t.Fatalf("run %d produced out-of-range score: %d", i+1, result.OverallScore)
|
||||||
|
}
|
||||||
|
|
||||||
|
scores = append(scores, result.OverallScore)
|
||||||
|
t.Logf("run %d score: %d", i+1, result.OverallScore)
|
||||||
|
|
||||||
|
if i < runs-1 {
|
||||||
|
time.Sleep(300 * time.Millisecond)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
baseline := scores[0]
|
||||||
|
for i, score := range scores {
|
||||||
|
delta := score - baseline
|
||||||
|
if delta < 0 {
|
||||||
|
delta = -delta
|
||||||
|
}
|
||||||
|
if delta > 10 {
|
||||||
|
t.Fatalf("run %d score %d exceeded +/-10 bound from baseline %d", i+1, score, baseline)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
var sum float64
|
||||||
|
for _, s := range scores {
|
||||||
|
sum += float64(s)
|
||||||
|
}
|
||||||
|
mean := sum / float64(len(scores))
|
||||||
|
|
||||||
|
var variance float64
|
||||||
|
for _, s := range scores {
|
||||||
|
d := float64(s) - mean
|
||||||
|
variance += d * d
|
||||||
|
}
|
||||||
|
variance /= float64(len(scores))
|
||||||
|
stddev := math.Sqrt(variance)
|
||||||
|
|
||||||
|
t.Logf("baseline=%d scores=%v mean=%.2f stddev=%.2f", baseline, scores, mean, stddev)
|
||||||
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user