Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
227 changes: 60 additions & 167 deletions packages/sdk/server-ai/__tests__/Judge.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -70,14 +70,12 @@ describe('Judge', () => {
it('evaluates AI response successfully', async () => {
const mockStructuredResponse: StructuredResponse = {
data: {
evaluations: {
relevance: { score: 0.8, reasoning: 'The response is relevant to the question' },
},
score: 0.8,
reasoning: 'The response is relevant to the question',
},
rawResponse: JSON.stringify({
evaluations: {
relevance: { score: 0.8, reasoning: 'The response is relevant to the question' },
},
score: 0.8,
reasoning: 'The response is relevant to the question',
}),
metrics: {
success: true,
Expand Down Expand Up @@ -125,14 +123,12 @@ describe('Judge', () => {
it('returns evaluation result with correct evaluationMetricKey for tracker integration', async () => {
const mockStructuredResponse: StructuredResponse = {
data: {
evaluations: {
relevance: { score: 0.85, reasoning: 'Highly relevant response' },
},
score: 0.85,
reasoning: 'Highly relevant response',
},
rawResponse: JSON.stringify({
evaluations: {
relevance: { score: 0.85, reasoning: 'Highly relevant response' },
},
score: 0.85,
reasoning: 'Highly relevant response',
}),
metrics: {
success: true,
Expand All @@ -159,14 +155,12 @@ describe('Judge', () => {

const mockStructuredResponse: StructuredResponse = {
data: {
evaluations: {
relevance: { score: 0.8, reasoning: 'Good' },
},
score: 0.8,
reasoning: 'Good',
},
rawResponse: JSON.stringify({
evaluations: {
relevance: { score: 0.8, reasoning: 'Good' },
},
score: 0.8,
reasoning: 'Good',
}),
metrics: {
success: true,
Expand Down Expand Up @@ -237,14 +231,12 @@ describe('Judge', () => {

const mockStructuredResponse: StructuredResponse = {
data: {
evaluations: {
relevance: { score: 0.8, reasoning: 'The response is relevant' },
},
score: 0.8,
reasoning: 'The response is relevant',
},
rawResponse: JSON.stringify({
evaluations: {
relevance: { score: 0.8, reasoning: 'The response is relevant' },
},
score: 0.8,
reasoning: 'The response is relevant',
}),
metrics: {
success: true,
Expand Down Expand Up @@ -277,14 +269,12 @@ describe('Judge', () => {

const mockStructuredResponse: StructuredResponse = {
data: {
evaluations: {
relevance: { score: 0.8, reasoning: 'The response is relevant' },
},
score: 0.8,
reasoning: 'The response is relevant',
},
rawResponse: JSON.stringify({
evaluations: {
relevance: { score: 0.8, reasoning: 'The response is relevant' },
},
score: 0.8,
reasoning: 'The response is relevant',
}),
metrics: {
success: true,
Expand Down Expand Up @@ -317,14 +307,12 @@ describe('Judge', () => {

const mockStructuredResponse: StructuredResponse = {
data: {
evaluations: {
relevance: { score: 0.8, reasoning: 'The response is relevant' },
},
score: 0.8,
reasoning: 'The response is relevant',
},
rawResponse: JSON.stringify({
evaluations: {
relevance: { score: 0.8, reasoning: 'The response is relevant' },
},
score: 0.8,
reasoning: 'The response is relevant',
}),
metrics: {
success: true,
Expand Down Expand Up @@ -358,14 +346,12 @@ describe('Judge', () => {

const mockStructuredResponse: StructuredResponse = {
data: {
evaluations: {
helpfulness: { score: 0.7, reasoning: 'The response is helpful' },
},
score: 0.7,
reasoning: 'The response is helpful',
},
rawResponse: JSON.stringify({
evaluations: {
helpfulness: { score: 0.7, reasoning: 'The response is helpful' },
},
score: 0.7,
reasoning: 'The response is helpful',
}),
metrics: {
success: true,
Expand Down Expand Up @@ -409,18 +395,10 @@ describe('Judge', () => {
);
});

it('returns result with success false when expected metric is missing', async () => {
it('returns result with success false when response has no score or reasoning', async () => {
const mockStructuredResponse: StructuredResponse = {
data: {
evaluations: {
accuracy: { score: 0.9, reasoning: 'Accurate' },
},
},
rawResponse: JSON.stringify({
evaluations: {
accuracy: { score: 0.9, reasoning: 'Accurate' },
},
}),
data: {},
rawResponse: '{}',
metrics: {
success: true,
usage: { total: 100, input: 50, output: 50 },
Expand All @@ -437,19 +415,22 @@ describe('Judge', () => {
sampled: true,
judgeConfigKey: 'test-judge',
});
expect(mockLogger.warn).toHaveBeenCalledWith(
'Could not parse evaluation response for judge "test-judge": {}',
);
});

it('returns result with success false when response structure is malformed', async () => {
const mockStructuredResponse: StructuredResponse = {
data: {
relevance: { score: 0.8, reasoning: 'Good' },
accuracy: { score: 0.9, reasoning: 'Accurate' },
helpfulness: { score: 0.7, reasoning: 'Helpful' },
evaluations: {
relevance: { score: 0.8, reasoning: 'Good' },
},
},
rawResponse: JSON.stringify({
relevance: { score: 0.8, reasoning: 'Good' },
accuracy: { score: 0.9, reasoning: 'Accurate' },
helpfulness: { score: 0.7, reasoning: 'Helpful' },
evaluations: {
relevance: { score: 0.8, reasoning: 'Good' },
},
}),
metrics: {
success: true,
Expand All @@ -467,6 +448,9 @@ describe('Judge', () => {
sampled: true,
judgeConfigKey: 'test-judge',
});
expect(mockLogger.warn).toHaveBeenCalledWith(
expect.stringContaining('Could not parse evaluation response for judge "test-judge"'),
);
});

it('handles provider errors gracefully', async () => {
Expand Down Expand Up @@ -517,14 +501,12 @@ describe('Judge', () => {

const mockStructuredResponse: StructuredResponse = {
data: {
evaluations: {
relevance: { score: 0.8, reasoning: 'The response is relevant to the question' },
},
score: 0.8,
reasoning: 'The response is relevant to the question',
},
rawResponse: JSON.stringify({
evaluations: {
relevance: { score: 0.8, reasoning: 'The response is relevant to the question' },
},
score: 0.8,
reasoning: 'The response is relevant to the question',
}),
metrics: {
success: true,
Expand Down Expand Up @@ -620,116 +602,63 @@ describe('Judge', () => {
it('parses valid evaluation response correctly', () => {
// eslint-disable-next-line no-underscore-dangle
const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge);
const responseData = {
evaluations: {
relevance: { score: 0.8, reasoning: 'Good' },
},
};
const responseData = { score: 0.8, reasoning: 'Good' };

const result = parseResponse(responseData, 'relevance', mockTracker);
const result = parseResponse(responseData);

expect(result).toEqual({
score: 0.8,
reasoning: 'Good',
});
});

it('returns undefined for invalid response data', () => {
it('returns undefined for empty response data', () => {
// eslint-disable-next-line no-underscore-dangle
const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge);
const responseData = {
relevance: { score: 0.8, reasoning: 'Good' },
};

const result = parseResponse(responseData, 'relevance', mockTracker);
const result = parseResponse({});

expect(result).toBeUndefined();
});

it('handles missing score or reasoning fields', () => {
it('handles missing reasoning field', () => {
// eslint-disable-next-line no-underscore-dangle
const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge);
const responseData = {
evaluations: {
relevance: { score: 0.8 },
},
};
const responseData = { score: 0.8 };

const result = parseResponse(responseData, 'relevance', mockTracker);
const result = parseResponse(responseData);

expect(result).toBeUndefined();
});

it('handles invalid score values out of range', () => {
// eslint-disable-next-line no-underscore-dangle
const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge);
const responseData = {
evaluations: {
relevance: { score: 1.5, reasoning: 'Good' },
},
};
const responseData = { score: 1.5, reasoning: 'Good' };

const result = parseResponse(responseData, 'relevance', mockTracker);
const result = parseResponse(responseData);

expect(result).toBeUndefined();
expect(mockLogger.warn).toHaveBeenCalledWith(
expect.stringContaining('Invalid score evaluated for relevance: 1.5'),
mockTrackData,
);
});

it('handles negative score values', () => {
// eslint-disable-next-line no-underscore-dangle
const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge);
const responseData = {
evaluations: {
relevance: { score: -0.1, reasoning: 'Good' },
},
};
const responseData = { score: -0.1, reasoning: 'Good' };

const result = parseResponse(responseData, 'relevance', mockTracker);
const result = parseResponse(responseData);

expect(result).toBeUndefined();
expect(mockLogger.warn).toHaveBeenCalledWith(
expect.stringContaining('Invalid score evaluated for relevance: -0.1'),
mockTrackData,
);
});

it('handles invalid reasoning type', () => {
// eslint-disable-next-line no-underscore-dangle
const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge);
const responseData = {
evaluations: {
relevance: { score: 0.8, reasoning: 123 },
},
};

const result = parseResponse(responseData, 'relevance', mockTracker);

expect(result).toBeUndefined();
expect(mockLogger.warn).toHaveBeenCalledWith(
expect.stringContaining('Invalid reasoning evaluated for relevance: 123'),
mockTrackData,
);
});

it('handles missing evaluation when key does not exist in response', () => {
// eslint-disable-next-line no-underscore-dangle
const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge);
const responseData = {
evaluations: {
accuracy: { score: 0.9, reasoning: 'Accurate' },
},
};
const responseData = { score: 0.8, reasoning: 123 };

const result = parseResponse(responseData, 'relevance', mockTracker);
const result = parseResponse(responseData);

expect(result).toBeUndefined();
expect(mockLogger.warn).toHaveBeenCalledWith(
'Missing evaluation for metric key: relevance',
mockTrackData,
);
});

it('handles empty evaluationMetricKeys array fallback', async () => {
Expand All @@ -753,41 +682,5 @@ describe('Judge', () => {
mockTrackData,
);
});

it('handles evaluation value that is not an object', () => {
// eslint-disable-next-line no-underscore-dangle
const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge);
const responseData = {
evaluations: {
relevance: 'not an object',
},
};

const result = parseResponse(responseData, 'relevance', mockTracker);

expect(result).toBeUndefined();
expect(mockLogger.warn).toHaveBeenCalledWith(
'Missing evaluation for metric key: relevance',
mockTrackData,
);
});

it('handles null evaluation value', () => {
// eslint-disable-next-line no-underscore-dangle
const parseResponse = (judge as any)._parseEvaluationResponse.bind(judge);
const responseData = {
evaluations: {
relevance: null,
},
};

const result = parseResponse(responseData, 'relevance', mockTracker);

expect(result).toBeUndefined();
expect(mockLogger.warn).toHaveBeenCalledWith(
'Missing evaluation for metric key: relevance',
mockTrackData,
);
});
});
});
Loading
Loading