diff --git a/packages/sdk/server-ai/__tests__/Judge.test.ts b/packages/sdk/server-ai/__tests__/Judge.test.ts index ee9ff0351f..43ea75e0ab 100644 --- a/packages/sdk/server-ai/__tests__/Judge.test.ts +++ b/packages/sdk/server-ai/__tests__/Judge.test.ts @@ -70,14 +70,12 @@ describe('Judge', () => { it('evaluates AI response successfully', async () => { const mockStructuredResponse: StructuredResponse = { data: { - evaluations: { - relevance: { score: 0.8, reasoning: 'The response is relevant to the question' }, - }, + score: 0.8, + reasoning: 'The response is relevant to the question', }, rawResponse: JSON.stringify({ - evaluations: { - relevance: { score: 0.8, reasoning: 'The response is relevant to the question' }, - }, + score: 0.8, + reasoning: 'The response is relevant to the question', }), metrics: { success: true, @@ -125,14 +123,12 @@ describe('Judge', () => { it('returns evaluation result with correct evaluationMetricKey for tracker integration', async () => { const mockStructuredResponse: StructuredResponse = { data: { - evaluations: { - relevance: { score: 0.85, reasoning: 'Highly relevant response' }, - }, + score: 0.85, + reasoning: 'Highly relevant response', }, rawResponse: JSON.stringify({ - evaluations: { - relevance: { score: 0.85, reasoning: 'Highly relevant response' }, - }, + score: 0.85, + reasoning: 'Highly relevant response', }), metrics: { success: true, @@ -159,14 +155,12 @@ describe('Judge', () => { const mockStructuredResponse: StructuredResponse = { data: { - evaluations: { - relevance: { score: 0.8, reasoning: 'Good' }, - }, + score: 0.8, + reasoning: 'Good', }, rawResponse: JSON.stringify({ - evaluations: { - relevance: { score: 0.8, reasoning: 'Good' }, - }, + score: 0.8, + reasoning: 'Good', }), metrics: { success: true, @@ -237,14 +231,12 @@ describe('Judge', () => { const mockStructuredResponse: StructuredResponse = { data: { - evaluations: { - relevance: { score: 0.8, reasoning: 'The response is relevant' }, - }, + score: 0.8, + reasoning: 'The response is relevant', }, rawResponse: JSON.stringify({ - evaluations: { - relevance: { score: 0.8, reasoning: 'The response is relevant' }, - }, + score: 0.8, + reasoning: 'The response is relevant', }), metrics: { success: true, @@ -277,14 +269,12 @@ describe('Judge', () => { const mockStructuredResponse: StructuredResponse = { data: { - evaluations: { - relevance: { score: 0.8, reasoning: 'The response is relevant' }, - }, + score: 0.8, + reasoning: 'The response is relevant', }, rawResponse: JSON.stringify({ - evaluations: { - relevance: { score: 0.8, reasoning: 'The response is relevant' }, - }, + score: 0.8, + reasoning: 'The response is relevant', }), metrics: { success: true, @@ -317,14 +307,12 @@ describe('Judge', () => { const mockStructuredResponse: StructuredResponse = { data: { - evaluations: { - relevance: { score: 0.8, reasoning: 'The response is relevant' }, - }, + score: 0.8, + reasoning: 'The response is relevant', }, rawResponse: JSON.stringify({ - evaluations: { - relevance: { score: 0.8, reasoning: 'The response is relevant' }, - }, + score: 0.8, + reasoning: 'The response is relevant', }), metrics: { success: true, @@ -358,14 +346,12 @@ describe('Judge', () => { const mockStructuredResponse: StructuredResponse = { data: { - evaluations: { - helpfulness: { score: 0.7, reasoning: 'The response is helpful' }, - }, + score: 0.7, + reasoning: 'The response is helpful', }, rawResponse: JSON.stringify({ - evaluations: { - helpfulness: { score: 0.7, reasoning: 'The response is helpful' }, - }, + score: 0.7, + reasoning: 'The response is helpful', }), metrics: { success: true, @@ -409,18 +395,10 @@ describe('Judge', () => { ); }); - it('returns result with success false when expected metric is missing', async () => { + it('returns result with success false when response has no score or reasoning', async () => { const mockStructuredResponse: StructuredResponse = { - data: { - evaluations: { - accuracy: { score: 0.9, reasoning: 'Accurate' }, - }, - }, - rawResponse: JSON.stringify({ - evaluations: { - accuracy: { score: 0.9, reasoning: 'Accurate' }, - }, - }), + data: {}, + rawResponse: '{}', metrics: { success: true, usage: { total: 100, input: 50, output: 50 }, @@ -437,19 +415,23 @@ describe('Judge', () => { sampled: true, judgeConfigKey: 'test-judge', }); + expect(mockLogger.warn).toHaveBeenCalledWith( + 'Could not parse evaluation response: {}', + mockTrackData, + ); }); it('returns result with success false when response structure is malformed', async () => { const mockStructuredResponse: StructuredResponse = { data: { - relevance: { score: 0.8, reasoning: 'Good' }, - accuracy: { score: 0.9, reasoning: 'Accurate' }, - helpfulness: { score: 0.7, reasoning: 'Helpful' }, + evaluations: { + relevance: { score: 0.8, reasoning: 'Good' }, + }, }, rawResponse: JSON.stringify({ - relevance: { score: 0.8, reasoning: 'Good' }, - accuracy: { score: 0.9, reasoning: 'Accurate' }, - helpfulness: { score: 0.7, reasoning: 'Helpful' }, + evaluations: { + relevance: { score: 0.8, reasoning: 'Good' }, + }, }), metrics: { success: true, @@ -467,6 +449,10 @@ describe('Judge', () => { sampled: true, judgeConfigKey: 'test-judge', }); + expect(mockLogger.warn).toHaveBeenCalledWith( + expect.stringContaining('Could not parse evaluation response:'), + mockTrackData, + ); }); it('handles provider errors gracefully', async () => { @@ -517,14 +503,12 @@ describe('Judge', () => { const mockStructuredResponse: StructuredResponse = { data: { - evaluations: { - relevance: { score: 0.8, reasoning: 'The response is relevant to the question' }, - }, + score: 0.8, + reasoning: 'The response is relevant to the question', }, rawResponse: JSON.stringify({ - evaluations: { - relevance: { score: 0.8, reasoning: 'The response is relevant to the question' }, - }, + score: 0.8, + reasoning: 'The response is relevant to the question', }), metrics: { success: true, @@ -620,13 +604,9 @@ describe('Judge', () => { it('parses valid evaluation response correctly', () => { // eslint-disable-next-line no-underscore-dangle const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); - const responseData = { - evaluations: { - relevance: { score: 0.8, reasoning: 'Good' }, - }, - }; + const responseData = { score: 0.8, reasoning: 'Good' }; - const result = parseResponse(responseData, 'relevance', mockTracker); + const result = parseResponse(responseData); expect(result).toEqual({ score: 0.8, @@ -634,28 +614,21 @@ describe('Judge', () => { }); }); - it('returns undefined for invalid response data', () => { + it('returns undefined for empty response data', () => { // eslint-disable-next-line no-underscore-dangle const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); - const responseData = { - relevance: { score: 0.8, reasoning: 'Good' }, - }; - const result = parseResponse(responseData, 'relevance', mockTracker); + const result = parseResponse({}); expect(result).toBeUndefined(); }); - it('handles missing score or reasoning fields', () => { + it('handles missing reasoning field', () => { // eslint-disable-next-line no-underscore-dangle const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); - const responseData = { - evaluations: { - relevance: { score: 0.8 }, - }, - }; + const responseData = { score: 0.8 }; - const result = parseResponse(responseData, 'relevance', mockTracker); + const result = parseResponse(responseData); expect(result).toBeUndefined(); }); @@ -663,73 +636,31 @@ describe('Judge', () => { it('handles invalid score values out of range', () => { // eslint-disable-next-line no-underscore-dangle const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); - const responseData = { - evaluations: { - relevance: { score: 1.5, reasoning: 'Good' }, - }, - }; + const responseData = { score: 1.5, reasoning: 'Good' }; - const result = parseResponse(responseData, 'relevance', mockTracker); + const result = parseResponse(responseData); expect(result).toBeUndefined(); - expect(mockLogger.warn).toHaveBeenCalledWith( - expect.stringContaining('Invalid score evaluated for relevance: 1.5'), - mockTrackData, - ); }); it('handles negative score values', () => { // eslint-disable-next-line no-underscore-dangle const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); - const responseData = { - evaluations: { - relevance: { score: -0.1, reasoning: 'Good' }, - }, - }; + const responseData = { score: -0.1, reasoning: 'Good' }; - const result = parseResponse(responseData, 'relevance', mockTracker); + const result = parseResponse(responseData); expect(result).toBeUndefined(); - expect(mockLogger.warn).toHaveBeenCalledWith( - expect.stringContaining('Invalid score evaluated for relevance: -0.1'), - mockTrackData, - ); }); it('handles invalid reasoning type', () => { // eslint-disable-next-line no-underscore-dangle const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); - const responseData = { - evaluations: { - relevance: { score: 0.8, reasoning: 123 }, - }, - }; + const responseData = { score: 0.8, reasoning: 123 }; - const result = parseResponse(responseData, 'relevance', mockTracker); + const result = parseResponse(responseData); expect(result).toBeUndefined(); - expect(mockLogger.warn).toHaveBeenCalledWith( - expect.stringContaining('Invalid reasoning evaluated for relevance: 123'), - mockTrackData, - ); - }); - - it('handles missing evaluation when key does not exist in response', () => { - // eslint-disable-next-line no-underscore-dangle - const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); - const responseData = { - evaluations: { - accuracy: { score: 0.9, reasoning: 'Accurate' }, - }, - }; - - const result = parseResponse(responseData, 'relevance', mockTracker); - - expect(result).toBeUndefined(); - expect(mockLogger.warn).toHaveBeenCalledWith( - 'Missing evaluation for metric key: relevance', - mockTrackData, - ); }); it('handles empty evaluationMetricKeys array fallback', async () => { @@ -753,41 +684,5 @@ describe('Judge', () => { mockTrackData, ); }); - - it('handles evaluation value that is not an object', () => { - // eslint-disable-next-line no-underscore-dangle - const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); - const responseData = { - evaluations: { - relevance: 'not an object', - }, - }; - - const result = parseResponse(responseData, 'relevance', mockTracker); - - expect(result).toBeUndefined(); - expect(mockLogger.warn).toHaveBeenCalledWith( - 'Missing evaluation for metric key: relevance', - mockTrackData, - ); - }); - - it('handles null evaluation value', () => { - // eslint-disable-next-line no-underscore-dangle - const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge); - const responseData = { - evaluations: { - relevance: null, - }, - }; - - const result = parseResponse(responseData, 'relevance', mockTracker); - - expect(result).toBeUndefined(); - expect(mockLogger.warn).toHaveBeenCalledWith( - 'Missing evaluation for metric key: relevance', - mockTrackData, - ); - }); }); }); diff --git a/packages/sdk/server-ai/src/api/judge/EvaluationSchemaBuilder.ts b/packages/sdk/server-ai/src/api/judge/EvaluationSchemaBuilder.ts deleted file mode 100644 index 06f745a418..0000000000 --- a/packages/sdk/server-ai/src/api/judge/EvaluationSchemaBuilder.ts +++ /dev/null @@ -1,49 +0,0 @@ -/** - * Internal class for building dynamic evaluation response schemas. - * Not exported - only used internally by TrackedJudge. - */ -class EvaluationSchemaBuilder { - static build(evaluationMetricKey?: string): Record { - if (!evaluationMetricKey) { - return {}; - } - return { - type: 'object', - properties: { - evaluations: { - type: 'object', - description: `Object containing evaluation results for ${evaluationMetricKey} metric`, - properties: { - [evaluationMetricKey]: this._buildKeySchema(evaluationMetricKey), - }, - required: [evaluationMetricKey], - additionalProperties: false, - }, - }, - required: ['evaluations'], - additionalProperties: false, - } as const; - } - - private static _buildKeySchema(key: string) { - return { - type: 'object', - properties: { - score: { - type: 'number', - minimum: 0, - maximum: 1, - description: `Score between 0.0 and 1.0 for ${key}`, - }, - reasoning: { - type: 'string', - description: `Reasoning behind the score for ${key}`, - }, - }, - required: ['score', 'reasoning'], - additionalProperties: false, - }; - } -} - -export { EvaluationSchemaBuilder }; diff --git a/packages/sdk/server-ai/src/api/judge/Judge.ts b/packages/sdk/server-ai/src/api/judge/Judge.ts index e36ab138cd..ef49e3b723 100644 --- a/packages/sdk/server-ai/src/api/judge/Judge.ts +++ b/packages/sdk/server-ai/src/api/judge/Judge.ts @@ -3,12 +3,28 @@ import Mustache from 'mustache'; import { LDLogger } from '@launchdarkly/js-server-sdk-common'; import { ChatResponse } from '../chat/types'; -import { LDAIConfigTracker } from '../config/LDAIConfigTracker'; import { LDAIJudgeConfig, LDMessage } from '../config/types'; import { AIProvider } from '../providers/AIProvider'; -import { EvaluationSchemaBuilder } from './EvaluationSchemaBuilder'; import { LDJudgeResult, StructuredResponse } from './types'; +const EVALUATION_SCHEMA = { + type: 'object', + properties: { + score: { + type: 'number', + minimum: 0, + maximum: 1, + description: 'Score between 0.0 and 1.0.', + }, + reasoning: { + type: 'string', + description: 'Reasoning behind the score.', + }, + }, + required: ['score', 'reasoning'], + additionalProperties: false, +} as const; + /** * Judge implementation that handles evaluation functionality and conversation management. * @@ -17,7 +33,6 @@ import { LDJudgeResult, StructuredResponse } from './types'; */ export class Judge { private readonly _logger?: LDLogger; - private readonly _evaluationResponseStructure: Record; constructor( private readonly _aiConfig: LDAIJudgeConfig, @@ -25,8 +40,6 @@ export class Judge { logger?: LDLogger, ) { this._logger = logger; - const evaluationMetricKey = this._getEvaluationMetricKey(); - this._evaluationResponseStructure = EvaluationSchemaBuilder.build(evaluationMetricKey); } /** @@ -97,14 +110,14 @@ export class Judge { const response = await tracker.trackMetricsOf( (r: StructuredResponse) => r.metrics, - () => this._aiProvider.invokeStructuredModel(messages, this._evaluationResponseStructure), + () => this._aiProvider.invokeStructuredModel(messages, EVALUATION_SCHEMA), ); - const evalResult = this._parseEvaluationResponse(response.data, evaluationMetricKey, tracker); + const evalResult = this._parseEvaluationResponse(response.data); if (!evalResult) { this._logger?.warn( - 'Judge evaluation did not return the expected evaluation', + `Could not parse evaluation response: ${JSON.stringify(response.data)}`, tracker.getTrackData(), ); return result; @@ -181,52 +194,27 @@ export class Judge { } /** - * Parses the structured evaluation response from the AI provider. + * Parses the structured evaluation response. Expects top-level {score, reasoning}. * Returns score and reasoning, or undefined if parsing fails. */ private _parseEvaluationResponse( data: Record, - evaluationMetricKey: string, - tracker: LDAIConfigTracker, ): { score: number; reasoning: string } | undefined { - const evaluations = data.evaluations as Record; - - if (!data.evaluations || typeof data.evaluations !== 'object') { - this._logger?.warn('Invalid response: missing or invalid evaluations object'); - return undefined; - } - - const evaluation = evaluations[evaluationMetricKey]; - - if (!evaluation || typeof evaluation !== 'object') { - this._logger?.warn( - `Missing evaluation for metric key: ${evaluationMetricKey}`, - tracker.getTrackData(), - ); + if (!data || typeof data !== 'object' || Array.isArray(data)) { return undefined; } - const evalData = evaluation as Record; - - if (typeof evalData.score !== 'number' || evalData.score < 0 || evalData.score > 1) { - this._logger?.warn( - `Invalid score evaluated for ${evaluationMetricKey}: ${evalData.score}. Score must be a number between 0 and 1 inclusive`, - tracker.getTrackData(), - ); + if (typeof data.score !== 'number' || data.score < 0 || data.score > 1) { return undefined; } - if (typeof evalData.reasoning !== 'string') { - this._logger?.warn( - `Invalid reasoning evaluated for ${evaluationMetricKey}: ${evalData.reasoning}. Reasoning must be a string`, - tracker.getTrackData(), - ); + if (typeof data.reasoning !== 'string') { return undefined; } return { - score: evalData.score, - reasoning: evalData.reasoning, + score: data.score, + reasoning: data.reasoning, }; } }