"""Add Congressional Record tables

Revision ID: b2c3d4e5f6a7
Revises: a1b2c3d4e5f6
Create Date: 2026-03-24 00:00:00.000000

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import ENUM

# Alembic revision identifiers.
revision: str = 'b2c3d4e5f6a7'
down_revision: Union[str, None] = 'a1b2c3d4e5f6'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Create the crec_* tables and the crecsection Postgres enum."""
    # Create CRECSection enum type only if it doesn't exist.  The raw
    # DO-block keeps the migration idempotent if the type was already
    # created out-of-band (e.g. by metadata.create_all during development).
    op.execute("DO $$ BEGIN IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'crecsection') THEN CREATE TYPE crecsection AS ENUM ('Senate', 'House', 'Extensions', 'DailyDigest'); END IF; END $$;")

    # Use create_type=False so SQLAlchemy doesn't try to re-create the existing enum
    crec_section_ref = ENUM('Senate', 'House', 'Extensions', 'DailyDigest', name='crecsection', create_type=False)

    # One row per daily Congressional Record issue.
    op.create_table(
        'crec_issue',
        sa.Column('crec_issue_id', sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column('issue_date', sa.Date(), nullable=True),
        sa.Column('congress_id', sa.Integer(), sa.ForeignKey('congress.congress_id', ondelete='CASCADE'), nullable=True),
        sa.Column('package_id', sa.String(), nullable=True),
        sa.Column('created_at', sa.DateTime(), server_default=sa.func.now()),
    )
    # unique=True: at most one Record issue per calendar day.
    op.create_index('ix_crec_issue_issue_date', 'crec_issue', ['issue_date'], unique=True)
    op.create_index('ix_crec_issue_congress_id', 'crec_issue', ['congress_id'])
    op.create_unique_constraint('uq_crec_issue_package_id', 'crec_issue', ['package_id'])

    # A discrete item/debate/segment within a daily issue.
    op.create_table(
        'crec_granule',
        sa.Column('crec_granule_id', sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column('crec_issue_id', sa.Integer(), sa.ForeignKey('crec_issue.crec_issue_id', ondelete='CASCADE'), nullable=True),
        sa.Column('granule_id', sa.String(), nullable=True),
        sa.Column('section', crec_section_ref, nullable=True),
        sa.Column('title', sa.String(), nullable=True),
        sa.Column('page_start', sa.String(), nullable=True),
        sa.Column('page_end', sa.String(), nullable=True),
        sa.Column('order_number', sa.Integer(), default=0),
        sa.Column('created_at', sa.DateTime(), server_default=sa.func.now()),
    )
    op.create_index('ix_crec_granule_crec_issue_id', 'crec_granule', ['crec_issue_id'])
    op.create_index('ix_crec_granule_section', 'crec_granule', ['section'])
    op.create_unique_constraint('uq_crec_granule_granule_id', 'crec_granule', ['granule_id'])

    # Individual speech segment within a granule, attributed to a speaker.
    op.create_table(
        'crec_speech',
        sa.Column('crec_speech_id', sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column('crec_granule_id', sa.Integer(), sa.ForeignKey('crec_granule.crec_granule_id', ondelete='CASCADE'), nullable=True),
        sa.Column('speaker_raw', sa.String(), nullable=True),
        sa.Column('legislator_bioguide_id', sa.String(), sa.ForeignKey('legislator.bioguide_id', ondelete='SET NULL'), nullable=True),
        sa.Column('order_number', sa.Integer(), default=0),
        sa.Column('content_text', sa.String(), nullable=True),
        sa.Column('word_count', sa.Integer(), default=0),
        sa.Column('created_at', sa.DateTime(), server_default=sa.func.now()),
    )
    op.create_index('ix_crec_speech_crec_granule_id', 'crec_speech', ['crec_granule_id'])
    op.create_index('ix_crec_speech_legislator_bioguide_id', 'crec_speech', ['legislator_bioguide_id'])

    # Bill citations found in speech text; SET NULL keeps the citation row
    # even if the referenced legislation row is deleted.
    op.create_table(
        'crec_bill_reference',
        sa.Column('crec_bill_reference_id', sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column('crec_speech_id', sa.Integer(), sa.ForeignKey('crec_speech.crec_speech_id', ondelete='CASCADE'), nullable=True),
        sa.Column('legislation_id', sa.Integer(), sa.ForeignKey('legislation.legislation_id', ondelete='SET NULL'), nullable=True),
        sa.Column('cite_text', sa.String(), nullable=True),
        sa.Column('cite_type', sa.String(), nullable=True),
        sa.Column('start_offset', sa.Integer(), nullable=True),
        sa.Column('end_offset', sa.Integer(), nullable=True),
    )
    op.create_index('ix_crec_bill_reference_crec_speech_id', 'crec_bill_reference', ['crec_speech_id'])
    op.create_index('ix_crec_bill_reference_legislation_id', 'crec_bill_reference', ['legislation_id'])

    # LLM summaries live in the separate "prompts" schema alongside the
    # other prompt-batch output tables.
    op.create_table(
        'crec_summary',
        sa.Column('crec_summary_id', sa.Integer(), primary_key=True, autoincrement=True),
        sa.Column('crec_granule_id', sa.Integer(), sa.ForeignKey('crec_granule.crec_granule_id', ondelete='CASCADE'), nullable=True),
        sa.Column('crec_issue_id', sa.Integer(), sa.ForeignKey('crec_issue.crec_issue_id', ondelete='CASCADE'), nullable=True),
        sa.Column('summary', sa.String(), nullable=True),
        sa.Column('summary_type', sa.String(), nullable=True),
        sa.Column('prompt_batch_id', sa.Integer(), sa.ForeignKey('prompts.prompt_batch.prompt_batch_id', ondelete='CASCADE'), nullable=True),
        sa.Column('created_at', sa.DateTime(), server_default=sa.func.now()),
        schema='prompts',
    )
    op.create_index('ix_crec_summary_crec_granule_id', 'crec_summary', ['crec_granule_id'], schema='prompts')
    op.create_index('ix_crec_summary_crec_issue_id', 'crec_summary', ['crec_issue_id'], schema='prompts')
    op.create_index('ix_crec_summary_prompt_batch_id', 'crec_summary', ['prompt_batch_id'], schema='prompts')


def downgrade() -> None:
    """Drop the crec_* tables (children first) and the crecsection enum."""
    op.drop_table('crec_summary', schema='prompts')
    op.drop_table('crec_bill_reference')
    op.drop_table('crec_speech')
    op.drop_table('crec_granule')
    op.drop_table('crec_issue')
    # checkfirst=True: tolerate the enum already being gone.
    sa.Enum(name='crecsection').drop(op.get_bind(), checkfirst=True)
class CRECSection(str, enum.Enum):
    # The four top-level sections of a daily Congressional Record issue.
    Senate = "Senate"
    House = "House"
    Extensions = "Extensions"
    DailyDigest = "DailyDigest"


class CRECIssue(Base):
    """
    One row per daily Congressional Record issue
    """

    __tablename__ = "crec_issue"

    crec_issue_id = Column(Integer, primary_key=True)
    # unique=True: at most one issue per calendar day.
    issue_date = Column(Date, unique=True, index=True)
    congress_id = Column(
        Integer, ForeignKey("congress.congress_id", ondelete="CASCADE"), index=True
    )
    # govinfo.gov package identifier, e.g. "CREC-2025-01-03".
    package_id = Column(String, unique=True)
    created_at = Column(DateTime(timezone=False), server_default=func.now())

    granules = relationship("CRECGranule", back_populates="issue")


class CRECGranule(Base):
    """
    A discrete item/debate/segment within a daily Congressional Record issue
    """

    __tablename__ = "crec_granule"

    crec_granule_id = Column(Integer, primary_key=True)
    crec_issue_id = Column(
        Integer,
        ForeignKey("crec_issue.crec_issue_id", ondelete="CASCADE"),
        index=True,
    )
    # govinfo granule identifier (or synthetic id built by the importer).
    granule_id = Column(String, unique=True)
    section = Column(Enum(CRECSection), index=True)
    title = Column(String)
    # Record page labels are strings (e.g. "S123"), not integers.
    page_start = Column(String, nullable=True)
    page_end = Column(String, nullable=True)
    # Position of the granule within its issue.
    order_number = Column(Integer, default=0)
    created_at = Column(DateTime(timezone=False), server_default=func.now())

    issue = relationship("CRECIssue", back_populates="granules")
    speeches = relationship("CRECSpeech", back_populates="granule")


class CRECSpeech(Base):
    """
    Individual speech segment within a granule, with speaker attribution
    """

    __tablename__ = "crec_speech"

    crec_speech_id = Column(Integer, primary_key=True)
    crec_granule_id = Column(
        Integer,
        ForeignKey("crec_granule.crec_granule_id", ondelete="CASCADE"),
        index=True,
    )
    # Raw surname extracted from the text, before resolution.
    speaker_raw = Column(String, nullable=True)
    # Resolved legislator; SET NULL keeps the speech if the legislator row goes away.
    legislator_bioguide_id = Column(
        String,
        ForeignKey("legislator.bioguide_id", ondelete="SET NULL"),
        index=True,
        nullable=True,
    )
    # Position of the speech within its granule.
    order_number = Column(Integer, default=0)
    content_text = Column(String)
    word_count = Column(Integer, default=0)
    created_at = Column(DateTime(timezone=False), server_default=func.now())

    granule = relationship("CRECGranule", back_populates="speeches")
    bill_references = relationship("CRECBillReference", back_populates="speech")


class CRECBillReference(Base):
    """
    Bill citations found within speech text in the Congressional Record
    """

    __tablename__ = "crec_bill_reference"

    crec_bill_reference_id = Column(Integer, primary_key=True)
    crec_speech_id = Column(
        Integer,
        ForeignKey("crec_speech.crec_speech_id", ondelete="CASCADE"),
        index=True,
    )
    # Resolved legislation row; nullable because a citation may not resolve.
    legislation_id = Column(
        Integer,
        ForeignKey("legislation.legislation_id", ondelete="SET NULL"),
        index=True,
        nullable=True,
    )
    # The citation as it appeared in the text, plus its classification.
    cite_text = Column(String)
    cite_type = Column(String)
    # Character offsets of the citation within the speech's content_text.
    start_offset = Column(Integer)
    end_offset = Column(Integer)

    speech = relationship("CRECSpeech", back_populates="bill_references")


class CRECSummary(PromptsBase):
    """
    LLM-generated summaries of Congressional Record debates
    """

    __tablename__ = "crec_summary"
    __table_args__ = {"schema": "prompts"}

    crec_summary_id = Column(Integer, primary_key=True)
    # A summary is attached to either a granule or a whole issue
    # (both FKs nullable; summary_type distinguishes the level).
    crec_granule_id = Column(
        Integer,
        ForeignKey("crec_granule.crec_granule_id", ondelete="CASCADE"),
        index=True,
        nullable=True,
    )
    crec_issue_id = Column(
        Integer,
        ForeignKey("crec_issue.crec_issue_id", ondelete="CASCADE"),
        index=True,
        nullable=True,
    )
    summary = Column(String)
    summary_type = Column(String)
    prompt_batch_id = Column(
        Integer,
        ForeignKey(PromptBatch.prompt_batch_id, ondelete="CASCADE"),
        index=True,
        nullable=True,
    )
    created_at = Column(DateTime(timezone=False), server_default=func.now())
async def get_issues(
    start_date: Optional[date],
    end_date: Optional[date],
    chamber: Optional[str],
    page: int = 1,
    page_size: int = 20,
) -> Tuple[List[CRECIssueInfo], int]:
    """Return one page of Congressional Record issues plus the total count.

    Filters: inclusive issue_date range and, when ``chamber`` is given,
    only issues containing at least one granule in that section.
    """
    database = await get_database()

    query = select(*CRECIssueInfo.sqlalchemy_columns()).select_from(CRECIssue)
    count_query = select(func.count(CRECIssue.crec_issue_id))

    if start_date:
        query = query.where(CRECIssue.issue_date >= start_date)
        count_query = count_query.where(CRECIssue.issue_date >= start_date)
    if end_date:
        query = query.where(CRECIssue.issue_date <= end_date)
        count_query = count_query.where(CRECIssue.issue_date <= end_date)
    if chamber:
        # Bug fix: the chamber argument was accepted but never applied.
        # Keep only issues that have at least one granule in this section.
        has_section = CRECIssue.crec_issue_id.in_(
            select(CRECGranule.crec_issue_id).where(CRECGranule.section == chamber)
        )
        query = query.where(has_section)
        count_query = count_query.where(has_section)

    total = await database.fetch_val(count_query)

    # Newest issues first, then paginate.
    query = query.order_by(desc(CRECIssue.issue_date))
    query = query.limit(page_size).offset((page - 1) * page_size)

    results = await database.fetch_all(query)
    return [CRECIssueInfo(**r) for r in results], total


async def get_issue_detail(issue_id: int):
    """Fetch one issue and its granules ordered by order_number.

    Returns (None, []) when the issue does not exist.
    """
    database = await get_database()

    issue_query = select(*CRECIssueInfo.sqlalchemy_columns()).where(
        CRECIssue.crec_issue_id == issue_id
    )
    issue = await database.fetch_one(issue_query)
    if not issue:
        return None, []

    granules_query = (
        select(*CRECGranuleInfo.sqlalchemy_columns())
        .where(CRECGranule.crec_issue_id == issue_id)
        .order_by(CRECGranule.order_number)
    )
    granules = await database.fetch_all(granules_query)

    return CRECIssueInfo(**issue), [CRECGranuleInfo(**g) for g in granules]


async def get_granule_detail(granule_id: int):
    """Fetch a granule, its speeches (with bill references), and its summary.

    Returns (None, [], None) when the granule does not exist.
    """
    database = await get_database()

    granule_query = select(*CRECGranuleInfo.sqlalchemy_columns()).where(
        CRECGranule.crec_granule_id == granule_id
    )
    granule = await database.fetch_one(granule_query)
    if not granule:
        return None, [], None

    speeches_query = (
        select(
            CRECSpeech.crec_speech_id,
            CRECSpeech.speaker_raw,
            CRECSpeech.legislator_bioguide_id,
            CRECSpeech.order_number,
            CRECSpeech.content_text,
            CRECSpeech.word_count,
        )
        .where(CRECSpeech.crec_granule_id == granule_id)
        .order_by(CRECSpeech.order_number)
    )
    speech_rows = await database.fetch_all(speeches_query)

    # Perf fix: fetch every bill reference for the granule's speeches in a
    # single IN query instead of one query per speech (N+1 pattern).
    refs_by_speech: dict = {}
    speech_ids = [row["crec_speech_id"] for row in speech_rows]
    if speech_ids:
        refs_query = select(
            CRECBillReference.crec_speech_id,
            CRECBillReference.crec_bill_reference_id,
            CRECBillReference.legislation_id,
            CRECBillReference.cite_text,
            CRECBillReference.cite_type,
            CRECBillReference.start_offset,
            CRECBillReference.end_offset,
        ).where(CRECBillReference.crec_speech_id.in_(speech_ids))
        for r in await database.fetch_all(refs_query):
            refs_by_speech.setdefault(r["crec_speech_id"], []).append(
                CRECBillReferenceInfo(
                    crec_bill_reference_id=r["crec_bill_reference_id"],
                    legislation_id=r["legislation_id"],
                    cite_text=r["cite_text"],
                    cite_type=r["cite_type"],
                    start_offset=r["start_offset"],
                    end_offset=r["end_offset"],
                )
            )

    speeches = [
        CRECSpeechDetailInfo(
            crec_speech_id=row["crec_speech_id"],
            speaker_raw=row["speaker_raw"],
            legislator_bioguide_id=row["legislator_bioguide_id"],
            order_number=row["order_number"],
            content_text=row["content_text"],
            word_count=row["word_count"],
            bill_references=refs_by_speech.get(row["crec_speech_id"], []),
        )
        for row in speech_rows
    ]

    summary_query = (
        select(CRECSummary.summary)
        .where(
            CRECSummary.crec_granule_id == granule_id,
            CRECSummary.summary_type == "granule",
        )
        .limit(1)
    )
    summary_row = await database.fetch_one(summary_query)
    summary = summary_row["summary"] if summary_row else None

    return CRECGranuleInfo(**granule), speeches, summary


async def get_speaker_stats(
    start_date: Optional[date],
    end_date: Optional[date],
    chamber: Optional[str],
    limit: int = 20,
    page: int = 1,
) -> Tuple[List[CRECSpeakerStatInfo], int]:
    """Aggregate word counts and speech counts per resolved legislator.

    The granule/issue tables are joined only when a date or chamber filter
    is present, since those filters live on the joined tables.
    """
    database = await get_database()

    # Single query skeleton (previously duplicated across two branches).
    query = select(
        CRECSpeech.legislator_bioguide_id.label("bioguide_id"),
        Legislator.first_name,
        Legislator.last_name,
        Legislator.party,
        Legislator.state,
        Legislator.image_url,
        func.sum(CRECSpeech.word_count).label("total_words"),
        func.count(CRECSpeech.crec_speech_id).label("speech_count"),
    ).join(Legislator, CRECSpeech.legislator_bioguide_id == Legislator.bioguide_id)

    conditions = [CRECSpeech.legislator_bioguide_id.isnot(None)]

    if start_date or end_date or chamber:
        query = query.join(
            CRECGranule, CRECSpeech.crec_granule_id == CRECGranule.crec_granule_id
        ).join(CRECIssue, CRECGranule.crec_issue_id == CRECIssue.crec_issue_id)
        if start_date:
            conditions.append(CRECIssue.issue_date >= start_date)
        if end_date:
            conditions.append(CRECIssue.issue_date <= end_date)
        if chamber:
            conditions.append(CRECGranule.section == chamber)

    query = query.where(and_(*conditions)).group_by(
        CRECSpeech.legislator_bioguide_id,
        Legislator.first_name,
        Legislator.last_name,
        Legislator.party,
        Legislator.state,
        Legislator.image_url,
    )

    # Count distinct speakers (for pagination) before limit/offset.
    count_subquery = query.subquery()
    total = await database.fetch_val(
        select(func.count()).select_from(count_subquery)
    )

    query = query.order_by(desc("total_words"))
    query = query.limit(limit).offset((page - 1) * limit)

    results = await database.fetch_all(query)
    return [CRECSpeakerStatInfo(**r) for r in results], total


async def get_activity_calendar(
    start_date: Optional[date] = None,
    end_date: Optional[date] = None,
) -> List[CRECActivityItem]:
    """Granule counts per issue date, for the activity heatmap."""
    database = await get_database()

    query = select(
        CRECIssue.issue_date.label("date"),
        func.count(CRECGranule.crec_granule_id).label("count"),
    ).join(CRECGranule, CRECIssue.crec_issue_id == CRECGranule.crec_issue_id)

    # Apply date filters before grouping (same WHERE clause either way).
    if start_date:
        query = query.where(CRECIssue.issue_date >= start_date)
    if end_date:
        query = query.where(CRECIssue.issue_date <= end_date)

    query = query.group_by(CRECIssue.issue_date).order_by(CRECIssue.issue_date)

    results = await database.fetch_all(query)
    return [CRECActivityItem(**r) for r in results]
from typing import Annotated, List, Optional
from datetime import date, datetime

from pydantic import BaseModel
from congress_fastapi.models.abstract import MappableBase
from congress_db.models import (
    CRECIssue,
    CRECGranule,
    CRECSpeech,
    CRECBillReference,
)


class CRECIssueInfo(MappableBase):
    """One daily Congressional Record issue, mapped from CRECIssue columns."""

    crec_issue_id: Annotated[int, CRECIssue.crec_issue_id]
    issue_date: Annotated[Optional[date], CRECIssue.issue_date]
    congress_id: Annotated[Optional[int], CRECIssue.congress_id]
    package_id: Annotated[Optional[str], CRECIssue.package_id]


class CRECIssueListResponse(BaseModel):
    """Paginated issue listing."""

    issues: List[CRECIssueInfo]
    total_results: int


class CRECGranuleInfo(MappableBase):
    """One debate/segment within an issue, mapped from CRECGranule columns."""

    crec_granule_id: Annotated[int, CRECGranule.crec_granule_id]
    crec_issue_id: Annotated[Optional[int], CRECGranule.crec_issue_id]
    granule_id: Annotated[Optional[str], CRECGranule.granule_id]
    section: Annotated[Optional[str], CRECGranule.section]
    title: Annotated[Optional[str], CRECGranule.title]
    page_start: Annotated[Optional[str], CRECGranule.page_start]
    page_end: Annotated[Optional[str], CRECGranule.page_end]
    order_number: Annotated[Optional[int], CRECGranule.order_number]


class CRECIssueDetailResponse(BaseModel):
    """An issue plus all of its granules."""

    issue: CRECIssueInfo
    granules: List[CRECGranuleInfo]


class CRECSpeechInfo(BaseModel):
    """One speech segment within a granule."""

    crec_speech_id: int
    speaker_raw: Optional[str] = None
    legislator_bioguide_id: Optional[str] = None
    order_number: Optional[int] = None
    content_text: Optional[str] = None
    word_count: Optional[int] = None


class CRECBillReferenceInfo(BaseModel):
    """A bill citation found inside a speech's text."""

    crec_bill_reference_id: int
    legislation_id: Optional[int] = None
    cite_text: Optional[str] = None
    cite_type: Optional[str] = None
    # Character offsets of the citation within the speech content.
    start_offset: Optional[int] = None
    end_offset: Optional[int] = None


class CRECSpeechDetailInfo(CRECSpeechInfo):
    """Speech plus the bill references extracted from it."""

    bill_references: List[CRECBillReferenceInfo] = []


class CRECGranuleDetailResponse(BaseModel):
    """A granule with its speeches and optional LLM summary."""

    granule: CRECGranuleInfo
    speeches: List[CRECSpeechDetailInfo]
    summary: Optional[str] = None


class CRECSpeakerStatInfo(BaseModel):
    """Aggregated speaking stats for one legislator."""

    bioguide_id: str
    first_name: Optional[str] = None
    last_name: Optional[str] = None
    party: Optional[str] = None
    state: Optional[str] = None
    image_url: Optional[str] = None
    total_words: int
    speech_count: int


class CRECSpeakerStatsResponse(BaseModel):
    """Paginated speaker-statistics listing."""

    speakers: List[CRECSpeakerStatInfo]
    total_results: int


class CRECActivityItem(BaseModel):
    """One day's granule count for the activity heatmap."""

    date: date
    count: int


class CRECActivityResponse(BaseModel):
    activity: List[CRECActivityItem]


# --- routes/congressional_record.py (file boundary collapsed in diff) ---
from typing import List, Optional
from datetime import date

from fastapi import APIRouter, HTTPException, Query, status

from congress_fastapi.handlers.congressional_record import (
    get_issues,
    get_issue_detail,
    get_granule_detail,
    get_speaker_stats,
    get_activity_calendar,
)
from congress_fastapi.models.congressional_record import (
    CRECIssueListResponse,
    CRECIssueDetailResponse,
    CRECGranuleDetailResponse,
    CRECSpeakerStatsResponse,
    CRECActivityResponse,
)
from congress_fastapi.models.errors import Error

router = APIRouter(tags=["Congressional Record"])
@router.get("/congressional-record")
async def list_issues(
    page: int = Query(1, description="Page number"),
    page_size: int = Query(20, alias="pageSize"),
    start_date: Optional[date] = Query(None, alias="startDate"),
    end_date: Optional[date] = Query(None, alias="endDate"),
    chamber: Optional[str] = Query(None, description="Filter by chamber section"),
) -> CRECIssueListResponse:
    """List Congressional Record daily issues with pagination and date filtering."""
    issues, total = await get_issues(start_date, end_date, chamber, page, page_size)
    return CRECIssueListResponse(issues=issues, total_results=total)


@router.get(
    "/congressional-record/{issue_id}",
    responses={
        status.HTTP_404_NOT_FOUND: {"model": Error, "detail": "Issue not found"},
    },
)
async def get_issue(issue_id: int) -> CRECIssueDetailResponse:
    """Get a single Congressional Record issue with its granules."""
    issue, granules = await get_issue_detail(issue_id)
    if issue is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND, detail="Issue not found"
        )
    return CRECIssueDetailResponse(issue=issue, granules=granules)


@router.get(
    "/congressional-record/{issue_id}/granule/{granule_id}",
    responses={
        status.HTTP_404_NOT_FOUND: {"model": Error, "detail": "Granule not found"},
    },
)
async def get_granule(issue_id: int, granule_id: int) -> CRECGranuleDetailResponse:
    """Get a granule with all its speeches and bill references.

    404 when the granule does not exist or does not belong to issue_id.
    """
    granule, speeches, summary = await get_granule_detail(granule_id)
    # Bug fix: issue_id was accepted but never checked, so any issue_id in
    # the URL returned any granule.  Treat a mismatched pair as not found.
    if granule is None or granule.crec_issue_id != issue_id:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND, detail="Granule not found"
        )
    return CRECGranuleDetailResponse(
        granule=granule, speeches=speeches, summary=summary
    )


@router.get("/congressional-record/stats/speakers")
async def speaker_statistics(
    start_date: Optional[date] = Query(None, alias="startDate"),
    end_date: Optional[date] = Query(None, alias="endDate"),
    chamber: Optional[str] = Query(None),
    limit: int = Query(20),
    page: int = Query(1),
) -> CRECSpeakerStatsResponse:
    """Get speaker statistics - who talked the most by word count."""
    speakers, total = await get_speaker_stats(
        start_date, end_date, chamber, limit, page
    )
    return CRECSpeakerStatsResponse(speakers=speakers, total_results=total)


@router.get("/congressional-record/stats/activity")
async def activity_calendar(
    start_date: Optional[date] = Query(None, alias="startDate"),
    end_date: Optional[date] = Query(None, alias="endDate"),
) -> CRECActivityResponse:
    """Get activity calendar data for heatmap visualization."""
    activity = await get_activity_calendar(start_date, end_date)
    return CRECActivityResponse(activity=activity)
"""
Congressional Record importer.

Downloads and parses daily Congressional Record PDFs from govinfo.gov.

PDF URL pattern:
    https://www.govinfo.gov/content/pkg/CREC-{year}-{month:02d}-{day:02d}/pdf/CREC-{year}-{month:02d}-{day:02d}.pdf

The importer:
    1. Iterates through dates in the current Congress
    2. Downloads the daily PDF
    3. Extracts text and splits by section (Senate, House, Extensions, Daily Digest)
    4. Identifies speakers and resolves them to legislator bioguide IDs
    5. Extracts bill references from speech text
    6. Stores everything in the crec_* tables

Usage:
    python -m congress_parser.importers.congressional_record
"""

from datetime import datetime, date, timedelta
import io
import logging
import re

import requests
from pdfminer.high_level import extract_text_to_fp
from pdfminer.layout import LAParams
from unidecode import unidecode

from congress_db.session import Session
from congress_db.models import (
    CRECIssue,
    CRECGranule,
    CRECSpeech,
    CRECBillReference,
    CRECSection,
    Congress,
    Legislator,
    Legislation,
    LegislationChamber,
    LegislationType,
)
from congress_parser.utils.cite_parser import extract_bill_references

logger = logging.getLogger(__name__)

PDF_URL = "https://www.govinfo.gov/content/pkg/CREC-{year}-{month:02d}-{day:02d}/pdf/CREC-{year}-{month:02d}-{day:02d}.pdf"

# Section header patterns in the PDF
SECTION_HEADERS = {
    re.compile(r"^\s*SENATE\s*$", re.MULTILINE): CRECSection.Senate,
    re.compile(r"^\s*HOUSE OF REPRESENTATIVES\s*$", re.MULTILINE): CRECSection.House,
    re.compile(r"^\s*EXTENSIONS OF REMARKS\s*$", re.MULTILINE): CRECSection.Extensions,
    re.compile(r"^\s*DAILY DIGEST\s*$", re.MULTILINE): CRECSection.DailyDigest,
}

# Heading pattern: all-caps line that marks a new topic/granule
HEADING_PATTERN = re.compile(r"^([A-Z][A-Z0-9 \-\'\.,]{4,})$")


def calculate_congress_from_year(year: int = None) -> int:
    """Return the Congress number in session during *year*.

    Generalized: ``year`` defaults to the current calendar year, preserving
    the original zero-argument behavior.  Congresses span two calendar
    years; the 107th convened in 2001.

    NOTE(review): the first days of an odd year (before Jan 3) technically
    belong to the previous Congress; this helper ignores that edge.
    """
    if year is None:
        year = datetime.now().year
    return ((year - 2001) // 2) + 107


def get_congress_start_date(congress_number: int) -> date:
    """Get the start date (Jan 3 of the odd year) for a given Congress number."""
    start_year = 2001 + (congress_number - 107) * 2
    return date(start_year, 1, 3)


def map_section(filename: str) -> "CRECSection":
    """Map a CREC filename or path to a CRECSection enum.

    Checks are ordered most-specific-first; falls back to Senate when
    nothing matches.
    """
    lower = filename.lower()
    if "/senate/" in lower or "senate" in lower:
        return CRECSection.Senate
    elif "/house/" in lower or "house" in lower:
        return CRECSection.House
    elif "/extensions/" in lower or "extension" in lower:
        return CRECSection.Extensions
    elif "/dailydigest/" in lower or "digest" in lower:
        return CRECSection.DailyDigest
    return CRECSection.Senate
# Bug fix: these patterns previously carried re.IGNORECASE, which defeats
# the deliberately upper-case-only character classes.  With IGNORECASE,
# lowercase words like "of" matched [A-Z][A-Z\-']+, so "Mr. SMITH of Ohio"
# extracted "SMITH OF OHIO" and the legislator lookup then failed.  The
# Record prints member surnames in capitals, so case-sensitive matching is
# the intended behavior.
SPEAKER_PATTERN = re.compile(
    r"^(?:Mr|Mrs|Ms|Miss|Madam|The)\.\s+([A-Z][A-Z\-\']+(?:\s+[A-Z][A-Z\-\']+)*)"
)

SPEAKER_PREFIX_PATTERN = re.compile(
    r"^(?:Mr|Mrs|Ms|Miss)\.\s+(?:Speaker|President|SPEAKER|PRESIDENT)?\s*[,.]?\s*(?:Mr|Mrs|Ms|Miss)\.\s+([A-Z][A-Z\-\']+)"
)


def extract_speaker_name(text: str) -> str:
    """Extract a speaker's upper-case last name from a speech paragraph.

    Tries the "Mr. Speaker, Mr. NAME" prefix form first, then the plain
    "Mr. NAME" form.  Returns "" when no member name is found or the match
    is a procedural title (SPEAKER, PRESIDENT, CHAIR, ...).
    """
    text = text.strip()
    match = SPEAKER_PREFIX_PATTERN.match(text)
    if match:
        return match.group(1).upper()
    match = SPEAKER_PATTERN.match(text)
    if match:
        name = match.group(1).upper()
        # Procedural roles are not member names.
        if name not in ("SPEAKER", "PRESIDENT", "CHAIR", "CHAIRMAN", "CHAIRWOMAN"):
            return name
    return ""


def resolve_speaker(speaker_name: str, chamber: "CRECSection", session) -> "str | None":
    """
    Resolve a raw speaker last name to a legislator bioguide_id.
    Uses chamber to disambiguate between House and Senate members.

    Returns None when the name is empty or no legislator matches.
    """
    if not speaker_name:
        return None

    from congress_db.models import LegislatorJob

    # Map the Record section to the matching job title for disambiguation.
    job = None
    if chamber == CRECSection.Senate:
        job = LegislatorJob.Senator
    elif chamber == CRECSection.House:
        job = LegislatorJob.Representative

    query = session.query(Legislator).filter(
        Legislator.last_name == speaker_name.title()
    )
    if job:
        query = query.filter(Legislator.job == job)

    legislators = query.all()
    if len(legislators) == 1:
        return legislators[0].bioguide_id
    elif len(legislators) > 1:
        # Ambiguous surname: prefer a member serving in the current Congress.
        current_congress = calculate_congress_from_year()
        for leg in legislators:
            if leg.congress_id and current_congress in leg.congress_id:
                return leg.bioguide_id
        return legislators[0].bioguide_id

    return None


def resolve_bill_reference(ref: dict, congress_id: int, session) -> "int | None":
    """Resolve a bill reference dict to a legislation_id, or None."""
    chamber_map = {
        "House": LegislationChamber.House,
        "Senate": LegislationChamber.Senate,
    }
    type_map = {
        "Bill": LegislationType.Bill,
        "Resolution": LegislationType.Res,
        "Joint Resolution": LegislationType.JRes,
        "Continuing Resolution": LegislationType.CRes,
    }

    chamber = chamber_map.get(ref["chamber"])
    leg_type = type_map.get(ref["legislation_type"])

    if not chamber or not leg_type:
        return None

    legislation = (
        session.query(Legislation)
        .filter(
            Legislation.chamber == chamber,
            Legislation.number == ref["number"],
            Legislation.legislation_type == leg_type,
            Legislation.congress_id == congress_id,
        )
        .first()
    )
    return legislation.legislation_id if legislation else None


def extract_pdf_text(pdf_bytes: bytes) -> str:
    """Extract plain text from PDF bytes using pdfminer."""
    output = io.StringIO()
    laparams = LAParams(line_margin=0.5, word_margin=0.1)
    # NOTE(review): recent pdfminer.six deprecates/ignores `codec` for text
    # output into a StringIO — confirm against the pinned pdfminer version.
    extract_text_to_fp(io.BytesIO(pdf_bytes), output, laparams=laparams, output_type="text", codec="utf-8")
    return output.getvalue()
def split_pdf_into_sections(text: str) -> list:
    """
    Split raw PDF text into sections (Senate, House, Extensions, DailyDigest).
    Returns a list of dicts: {section: CRECSection, text: str}
    """
    # Find all section header positions
    positions = []
    for pattern, section in SECTION_HEADERS.items():
        for m in pattern.finditer(text):
            positions.append((m.start(), section, m.end()))

    # No headers found: treat the whole document as one Senate section.
    if not positions:
        return [{"section": CRECSection.Senate, "text": text}]

    positions.sort(key=lambda x: x[0])
    sections = []
    for i, (start, section, end) in enumerate(positions):
        # Text between this header's end and the next header's start
        # (or end-of-document for the last header).
        section_text = text[end: positions[i + 1][0] if i + 1 < len(positions) else len(text)]
        sections.append({"section": section, "text": section_text.strip()})

    return sections


def split_section_into_granules(section_text: str, section: CRECSection) -> list:
    """
    Split a section's text into granules (topics) based on all-caps headings.
    Returns a list of dicts: {title: str, text: str}

    NOTE(review): the ``section`` parameter is currently unused here — it is
    kept for interface symmetry with the callers; confirm before removing.
    """
    lines = section_text.split("\n")
    granules = []
    current_title = "General"
    current_lines = []

    for line in lines:
        stripped = line.strip()
        if not stripped:
            # Preserve blank lines inside the current granule body.
            current_lines.append(line)
            continue
        # len > 5 excludes very short all-caps tokens that HEADING_PATTERN
        # (first char + at least 4 more) would otherwise accept.
        if HEADING_PATTERN.match(stripped) and len(stripped) > 5:
            # Flush the previous granule if it has any non-blank content.
            if current_lines and any(l.strip() for l in current_lines):
                granules.append({"title": current_title, "text": "\n".join(current_lines).strip()})
            current_title = stripped.title()
            current_lines = []
        else:
            current_lines.append(line)

    # Flush the trailing granule.
    if current_lines and any(l.strip() for l in current_lines):
        granules.append({"title": current_title, "text": "\n".join(current_lines).strip()})

    return granules if granules else [{"title": "General", "text": section_text}]
def parse_granule_speeches(granule_text: str) -> list:
    """
    Parse a granule's text into speech segments by speaker.
    Returns a list of dicts: {speaker_raw, content_text, order_number}

    A new segment starts whenever a line yields a speaker name different
    from the current one; text before the first speaker is emitted with an
    empty speaker_raw.
    """
    speeches = []
    current_speaker = ""
    current_lines = []
    order = 0

    for line in granule_text.split("\n"):
        # unidecode normalizes small-caps/accented glyphs from the PDF.
        text = unidecode(line.strip())
        if not text:
            if current_lines:
                current_lines.append("")
            continue

        speaker = extract_speaker_name(text)
        if speaker and speaker != current_speaker:
            # Flush the accumulated segment for the previous speaker.
            if current_lines:
                full_text = "\n".join(current_lines).strip()
                if full_text:
                    speeches.append({
                        "speaker_raw": current_speaker,
                        "content_text": full_text,
                        "order_number": order,
                    })
                    order += 1
            current_speaker = speaker
            current_lines = [text]
        else:
            current_lines.append(text)

    # Flush the trailing segment.
    if current_lines:
        full_text = "\n".join(current_lines).strip()
        if full_text:
            speeches.append({
                "speaker_raw": current_speaker,
                "content_text": full_text,
                "order_number": order,
            })

    return speeches


def import_daily_record(issue_date: date, session, congress_id: int):
    """Import a single day's Congressional Record from the PDF.

    Returns True on success; returns None when the day is skipped (already
    imported, no PDF published, download or extraction failure).
    """
    package_id = f"CREC-{issue_date.isoformat()}"

    existing = session.query(CRECIssue).filter(
        CRECIssue.package_id == package_id
    ).first()
    if existing:
        logger.info(f"Skipping {package_id} - already imported")
        return

    url = PDF_URL.format(
        year=issue_date.year,
        month=issue_date.month,
        day=issue_date.day,
    )

    logger.info(f"Downloading {url}")
    try:
        resp = requests.get(url, timeout=120)
    except requests.RequestException as e:
        logger.warning(f"Failed to download {url}: {e}")
        return

    if resp.status_code == 404:
        # No Record published that day; not an error.
        logger.debug(f"No record for {issue_date} (404)")
        return
    if resp.status_code != 200:
        logger.warning(f"Unexpected status {resp.status_code} for {url}")
        return

    logger.info(f"Extracting text from {package_id} PDF ({len(resp.content)} bytes)")
    try:
        full_text = extract_pdf_text(resp.content)
    except Exception as e:
        logger.warning(f"Failed to extract PDF text for {package_id}: {e}")
        return

    issue = CRECIssue(
        issue_date=issue_date,
        congress_id=congress_id,
        package_id=package_id,
    )
    session.add(issue)
    session.flush()

    # Bug fix: the cache used to be keyed on the raw surname alone, so a
    # surname present in both chambers resolved to whichever section was
    # parsed first that day.  Keying on (name, section) lets each chamber
    # resolve independently while still avoiding repeated DB lookups.
    speaker_cache = {}
    granule_order = 0

    sections = split_pdf_into_sections(full_text)
    for sec in sections:
        section = sec["section"]
        granules = split_section_into_granules(sec["text"], section)

        for gran in granules:
            granule = CRECGranule(
                crec_issue_id=issue.crec_issue_id,
                granule_id=f"{package_id}/{section.value}/{granule_order}",
                section=section,
                title=gran["title"],
                order_number=granule_order,
            )
            session.add(granule)
            session.flush()
            granule_order += 1

            speech_segments = parse_granule_speeches(gran["text"])
            for seg in speech_segments:
                speaker_raw = seg["speaker_raw"]

                cache_key = (speaker_raw, section)
                if cache_key in speaker_cache:
                    bioguide_id = speaker_cache[cache_key]
                else:
                    bioguide_id = resolve_speaker(speaker_raw, section, session)
                    speaker_cache[cache_key] = bioguide_id

                content_text = seg["content_text"]
                word_count = len(content_text.split())

                speech = CRECSpeech(
                    crec_granule_id=granule.crec_granule_id,
                    speaker_raw=speaker_raw or None,
                    legislator_bioguide_id=bioguide_id,
                    order_number=seg["order_number"],
                    content_text=content_text,
                    word_count=word_count,
                )
                session.add(speech)
                session.flush()

                bill_refs = extract_bill_references(content_text)
                # De-duplicate citations of the same bill within one speech.
                seen_refs = set()
                for ref in bill_refs:
                    ref_key = (ref["chamber"], ref["number"], ref["legislation_type"])
                    if ref_key in seen_refs:
                        continue
                    seen_refs.add(ref_key)

                    legislation_id = resolve_bill_reference(ref, congress_id, session)

                    bill_reference = CRECBillReference(
                        crec_speech_id=speech.crec_speech_id,
                        legislation_id=legislation_id,
                        cite_text=ref["cite_text"],
                        cite_type=ref["cite_type"],
                        start_offset=ref["start"],
                        end_offset=ref["end"],
                    )
                    session.add(bill_reference)

    session.commit()
    logger.info(f"Imported {package_id}: {granule_order} granules")
    return True
def run_import(start_date: date = None, end_date: date = None):
    """Run the Congressional Record import for a date range.

    Args:
        start_date: First issue date to import (inclusive). Defaults to the
            day after the most recently imported issue, or the start of the
            current Congress when nothing has been imported yet.
        end_date: Last issue date to import (inclusive). Defaults to today.

    Weekend dates are skipped; a failure on any single day is logged and
    rolled back without aborting the rest of the range.
    """
    db = Session()
    try:
        congress_number = calculate_congress_from_year()
        congress = db.query(Congress).filter(
            Congress.session_number == congress_number
        ).first()

        if not congress:
            logger.error(f"Congress {congress_number} not found in database")
            return

        congress_id = congress.congress_id

        if start_date is None:
            # Resume from the day after the latest imported issue, if any.
            latest = db.query(CRECIssue).order_by(
                CRECIssue.issue_date.desc()
            ).first()
            if latest:
                start_date = latest.issue_date + timedelta(days=1)
            else:
                start_date = get_congress_start_date(congress_number)

        if end_date is None:
            end_date = date.today()

        logger.info(f"Importing Congressional Record from {start_date} to {end_date}")

        current = start_date
        while current <= end_date:
            # weekday() < 5 means Monday-Friday; weekend issues are skipped.
            if current.weekday() < 5:
                try:
                    import_daily_record(current, db, congress_id)
                except Exception as e:
                    logger.error(f"Error importing {current}: {e}", exc_info=True)
                    db.rollback()
            current += timedelta(days=1)
    finally:
        # Always release the session — the original leaked it whenever an
        # uncaught exception escaped the loop or the setup queries.
        db.close()


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    run_import()
+""" + +import json +import logging +from datetime import datetime +from typing import List + +from congress_db.models import ( + CRECGranule, + CRECIssue, + CRECSpeech, + CRECSummary, + PromptBatch, +) +from congress_db.session import Session +from congress_parser.prompt_runners.utils import run_query + +logger = logging.getLogger(__name__) + +GRANULE_SUMMARY_PROMPT = """Summarize the following Congressional debate transcript. +Identify: +- The key topics discussed +- Which legislators spoke and their positions +- Any bills or legislation referenced +- Key arguments made for and against + +Keep the summary to 2-3 concise paragraphs. Output JSON with a "summary" key. + +TRANSCRIPT: +{transcript}""" + +DAILY_SUMMARY_PROMPT = """Summarize the following Congressional Record summaries for a single day. +These are summaries of individual debates and proceedings from the {chamber}. + +Provide a cohesive overview of the day's activities in 2-3 paragraphs. Output JSON with a "summary" key. + +SUMMARIES: +{summaries}""" + + +def build_transcript(speeches: List[CRECSpeech]) -> str: + """Build a formatted transcript from speech segments.""" + lines = [] + for speech in speeches: + speaker = speech.speaker_raw or "UNKNOWN" + lines.append(f"{speaker}: {speech.content_text}") + return "\n\n".join(lines) + + +def summarize_granule(granule_id: int, model: str = "ollama/gemma2:27b") -> str: + """Generate a summary for a single granule (debate segment).""" + with Session() as session: + speeches = ( + session.query(CRECSpeech) + .filter(CRECSpeech.crec_granule_id == granule_id) + .order_by(CRECSpeech.order_number) + .all() + ) + + if not speeches: + return None + + transcript = build_transcript(speeches) + + if len(transcript) < 100: + return None + + # Truncate very long transcripts to fit context + if len(transcript) > 50000: + transcript = transcript[:50000] + "\n\n[TRUNCATED]" + + prompt = GRANULE_SUMMARY_PROMPT.format(transcript=transcript) + + try: + response = run_query( + 
def summarize_daily_chamber(
    issue_id: int, chamber: str, granule_summaries: List[str],
    model: str = "ollama/gemma2:27b",
) -> str:
    """Roll granule-level summaries up into one daily summary for a chamber.

    Returns the summary text, or None when there is nothing to summarize or
    when the LLM call / JSON parse fails (failures are logged, not raised).
    """
    if not granule_summaries:
        return None

    # Join the individual summaries, capping total size to protect the
    # model's context window.
    joined = "\n\n---\n\n".join(granule_summaries)
    char_limit = 30000
    if len(joined) > char_limit:
        joined = joined[:char_limit] + "\n\n[TRUNCATED]"

    try:
        response = run_query(
            DAILY_SUMMARY_PROMPT.format(chamber=chamber, summaries=joined),
            model=model,
            num_ctx=32768,
            json=True,
            max_tokens=2000,
        )
        raw = response.choices[0].message.content
        # Fall back to the raw model output if the JSON lacks "summary".
        return json.loads(raw).get("summary", raw)
    except Exception as e:
        logger.error(f"Failed to summarize daily {chamber} for issue {issue_id}: {e}")
        return None
+ """ + with Session() as session: + # Find granules without summaries + summarized_granule_ids = ( + session.query(CRECSummary.crec_granule_id) + .filter( + CRECSummary.crec_granule_id.isnot(None), + CRECSummary.summary_type == "granule", + ) + .subquery() + ) + + unsummarized_granules = ( + session.query(CRECGranule) + .filter(CRECGranule.crec_granule_id.notin_( + session.query(summarized_granule_ids) + )) + .order_by(CRECGranule.crec_granule_id.desc()) + .all() + ) + + logger.info(f"Found {len(unsummarized_granules)} unsummarized granules") + + prompt_batch = PromptBatch( + prompt_id=prompt_id, + legislation_version_id=None, + attempted=0, + successful=0, + failed=0, + skipped=0, + created_at=datetime.now(), + ) + session.add(prompt_batch) + session.commit() + + for granule in unsummarized_granules: + prompt_batch.attempted += 1 + + summary_text = summarize_granule(granule.crec_granule_id, model=model) + if summary_text: + summary = CRECSummary( + crec_granule_id=granule.crec_granule_id, + crec_issue_id=granule.crec_issue_id, + summary=summary_text, + summary_type="granule", + prompt_batch_id=prompt_batch.prompt_batch_id, + ) + session.add(summary) + prompt_batch.successful += 1 + else: + prompt_batch.skipped += 1 + + session.commit() + + # Now generate daily summaries for issues with granule summaries but no daily summary + summarized_issue_ids = ( + session.query(CRECSummary.crec_issue_id) + .filter( + CRECSummary.crec_issue_id.isnot(None), + CRECSummary.summary_type == "daily", + ) + .subquery() + ) + + issues_needing_daily = ( + session.query(CRECIssue) + .filter(CRECIssue.crec_issue_id.notin_( + session.query(summarized_issue_ids) + )) + .order_by(CRECIssue.issue_date.desc()) + .all() + ) + + for issue in issues_needing_daily: + granule_summaries_by_chamber = {} + granule_summaries = ( + session.query(CRECSummary) + .join(CRECGranule, CRECSummary.crec_granule_id == CRECGranule.crec_granule_id) + .filter( + CRECSummary.crec_issue_id == issue.crec_issue_id, + 
CRECSummary.summary_type == "granule", + ) + .all() + ) + + for gs in granule_summaries: + granule = session.query(CRECGranule).get(gs.crec_granule_id) + if granule: + chamber = granule.section.value if granule.section else "Unknown" + if chamber not in granule_summaries_by_chamber: + granule_summaries_by_chamber[chamber] = [] + granule_summaries_by_chamber[chamber].append(gs.summary) + + for chamber, summaries in granule_summaries_by_chamber.items(): + prompt_batch.attempted += 1 + daily_text = summarize_daily_chamber( + issue.crec_issue_id, chamber, summaries, model=model, + ) + if daily_text: + daily_summary = CRECSummary( + crec_issue_id=issue.crec_issue_id, + summary=daily_text, + summary_type="daily", + prompt_batch_id=prompt_batch.prompt_batch_id, + ) + session.add(daily_summary) + prompt_batch.successful += 1 + else: + prompt_batch.skipped += 1 + + session.commit() + + prompt_batch.completed_at = datetime.now() + session.commit() + logger.info( + f"Summarization complete: {prompt_batch.successful} successful, " + f"{prompt_batch.failed} failed, {prompt_batch.skipped} skipped" + ) + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + run_crec_summarizer() diff --git a/backend/congress_parser/utils/cite_parser.py b/backend/congress_parser/utils/cite_parser.py index 81036aa..7456547 100644 --- a/backend/congress_parser/utils/cite_parser.py +++ b/backend/congress_parser/utils/cite_parser.py @@ -59,6 +59,78 @@ ) SUB_OF_REGEX = re.compile(r"sub(?:section)?\s\((.)\)", re.IGNORECASE) +# Congressional bill reference patterns for detecting citations in debate text +# Matches: H.R. 1234, S. 567, H.J.Res. 12, S.Con.Res. 5, H.Res. 89, etc. +BILL_REF_REGEX = re.compile( + r"(?P" + r"H\.?\s*R\.?" + r"|S\.?\s*J\.?\s*Res\.?" + r"|H\.?\s*J\.?\s*Res\.?" + r"|S\.?\s*Con\.?\s*Res\.?" + r"|H\.?\s*Con\.?\s*Res\.?" + r"|H\.?\s*Res\.?" + r"|S\.?\s*Res\.?" + r"|S\.?" 
+ r")\s*(?P\d+)", + re.IGNORECASE, +) + +# Map from citation prefix patterns to (chamber, legislation_type) tuples +BILL_PREFIX_MAP = { + "hr": ("House", "Bill"), + "s": ("Senate", "Bill"), + "hjres": ("House", "Joint Resolution"), + "sjres": ("Senate", "Joint Resolution"), + "hconres": ("House", "Continuing Resolution"), + "sconres": ("Senate", "Continuing Resolution"), + "hres": ("House", "Resolution"), + "sres": ("Senate", "Resolution"), +} + + +class BillRefObject(TypedDict): + cite_text: str + chamber: str + legislation_type: str + number: int + cite_type: str + start: int + end: int + + +def _normalize_prefix(prefix: str) -> str: + """Normalize a bill prefix like 'H. R.' or 'H.J. Res.' to a lookup key.""" + return re.sub(r"[\s.]", "", prefix).lower() + + +def extract_bill_references(text: str) -> List[BillRefObject]: + """ + Extract Congressional bill references from text (e.g. debate transcripts). + + Returns a list of BillRefObject dicts with cite_text, chamber, legislation_type, + number, start/end character offsets, and cite_type="bill". 
+ """ + results: List[BillRefObject] = [] + for match in BILL_REF_REGEX.finditer(text): + prefix = _normalize_prefix(match.group("prefix")) + number = int(match.group("number")) + + mapping = BILL_PREFIX_MAP.get(prefix) + if mapping is None: + continue + + chamber, leg_type = mapping + results.append({ + "cite_text": match.group(0), + "chamber": chamber, + "legislation_type": leg_type, + "number": number, + "cite_type": "bill", + "start": match.start(), + "end": match.end(), + }) + return results + def find_extra_clause_references(snippet): """ diff --git a/backend/requirements.txt b/backend/requirements.txt index 1f456e5..f2c28db 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -36,6 +36,7 @@ chromadb~=0.6.3 # Parsing tools lxml==6.0.2 +pdfminer.six==20231228 genson==1.3.0 jsonschema==4.25.1 pandas==2.3.3 diff --git a/hillstack/prisma/schema.prisma b/hillstack/prisma/schema.prisma index de61f11..9a7d049 100644 --- a/hillstack/prisma/schema.prisma +++ b/hillstack/prisma/schema.prisma @@ -64,6 +64,7 @@ model prompt_batch { legislation_content_summary legislation_content_summary[] legislation_content_tag legislation_content_tag[] legislation_version_tag legislation_version_tag[] + crec_summary crec_summary[] @@index([legislation_version_id], map: "ix_prompts_prompt_batch_legislation_version_id") @@index([prompt_id], map: "ix_prompts_prompt_batch_prompt_id") @@ -87,6 +88,7 @@ model congress { legislation_vote legislation_vote[] legislative_policy_area legislative_policy_area[] legislative_subject legislative_subject[] + crec_issue crec_issue[] @@schema("public") } @@ -111,6 +113,7 @@ model legislation { legislative_policy_area_association legislative_policy_area_association[] legislative_subject_association legislative_subject_association[] user_legislation user_legislation[] + crec_bill_reference crec_bill_reference[] @@unique([chamber, number, legislation_type, congress_id], map: "unq_bill") @@index([chamber], map: "ix_legislation_chamber") @@ 
-368,6 +371,7 @@ model legislator { legislation_sponsorship legislation_sponsorship[] legislator_vote legislator_vote[] user_legislator user_legislator[] + crec_speech crec_speech[] @@index([district], map: "ix_legislator_district") @@index([lis_id], map: "ix_legislator_lis_id") @@ -742,6 +746,100 @@ model verification_token { @@schema("authentication") } +model crec_issue { + crec_issue_id Int @id @default(autoincrement()) + issue_date DateTime? @unique(map: "ix_crec_issue_issue_date") @db.Date + congress_id Int? + package_id String? @unique(map: "uq_crec_issue_package_id") @db.VarChar + created_at DateTime? @default(now()) @db.Timestamp(6) + congress congress? @relation(fields: [congress_id], references: [congress_id], onDelete: Cascade, onUpdate: NoAction) + crec_granule crec_granule[] + crec_summary crec_summary[] + + @@index([congress_id], map: "ix_crec_issue_congress_id") + @@schema("public") +} + +model crec_granule { + crec_granule_id Int @id @default(autoincrement()) + crec_issue_id Int? + granule_id String? @unique(map: "uq_crec_granule_granule_id") @db.VarChar + section crecsection? + title String? @db.VarChar + page_start String? @db.VarChar + page_end String? @db.VarChar + order_number Int? + created_at DateTime? @default(now()) @db.Timestamp(6) + crec_issue crec_issue? @relation(fields: [crec_issue_id], references: [crec_issue_id], onDelete: Cascade, onUpdate: NoAction) + crec_speech crec_speech[] + crec_summary crec_summary[] + + @@index([crec_issue_id], map: "ix_crec_granule_crec_issue_id") + @@index([section], map: "ix_crec_granule_section") + @@schema("public") +} + +model crec_speech { + crec_speech_id Int @id @default(autoincrement()) + crec_granule_id Int? + speaker_raw String? @db.VarChar + legislator_bioguide_id String? @db.VarChar + order_number Int? + content_text String? @db.VarChar + word_count Int? + created_at DateTime? @default(now()) @db.Timestamp(6) + crec_granule crec_granule? 
@relation(fields: [crec_granule_id], references: [crec_granule_id], onDelete: Cascade, onUpdate: NoAction) + legislator legislator? @relation(fields: [legislator_bioguide_id], references: [bioguide_id], onDelete: SetNull, onUpdate: NoAction) + crec_bill_reference crec_bill_reference[] + + @@index([crec_granule_id], map: "ix_crec_speech_crec_granule_id") + @@index([legislator_bioguide_id], map: "ix_crec_speech_legislator_bioguide_id") + @@schema("public") +} + +model crec_bill_reference { + crec_bill_reference_id Int @id @default(autoincrement()) + crec_speech_id Int? + legislation_id Int? + cite_text String? @db.VarChar + cite_type String? @db.VarChar + start_offset Int? + end_offset Int? + crec_speech crec_speech? @relation(fields: [crec_speech_id], references: [crec_speech_id], onDelete: Cascade, onUpdate: NoAction) + legislation legislation? @relation(fields: [legislation_id], references: [legislation_id], onDelete: SetNull, onUpdate: NoAction) + + @@index([crec_speech_id], map: "ix_crec_bill_reference_crec_speech_id") + @@index([legislation_id], map: "ix_crec_bill_reference_legislation_id") + @@schema("public") +} + +model crec_summary { + crec_summary_id Int @id @default(autoincrement()) + crec_granule_id Int? + crec_issue_id Int? + summary String? @db.VarChar + summary_type String? @db.VarChar + prompt_batch_id Int? + created_at DateTime? @default(now()) @db.Timestamp(6) + crec_granule crec_granule? @relation(fields: [crec_granule_id], references: [crec_granule_id], onDelete: Cascade, onUpdate: NoAction) + crec_issue crec_issue? @relation(fields: [crec_issue_id], references: [crec_issue_id], onDelete: Cascade, onUpdate: NoAction) + prompt_batch prompt_batch? 
@relation(fields: [prompt_batch_id], references: [prompt_batch_id], onDelete: Cascade, onUpdate: NoAction) + + @@index([crec_granule_id], map: "ix_crec_summary_crec_granule_id") + @@index([crec_issue_id], map: "ix_crec_summary_crec_issue_id") + @@index([prompt_batch_id], map: "ix_crec_summary_prompt_batch_id") + @@schema("prompts") +} + +enum crecsection { + Senate + House + Extensions + DailyDigest + + @@schema("public") +} + enum legislationchamber { House Senate diff --git a/hillstack/src/app/congress/bills/[billId]/debates/page.tsx b/hillstack/src/app/congress/bills/[billId]/debates/page.tsx new file mode 100644 index 0000000..045c3d6 --- /dev/null +++ b/hillstack/src/app/congress/bills/[billId]/debates/page.tsx @@ -0,0 +1,162 @@ +'use client'; + +import Avatar from '@mui/material/Avatar'; +import Box from '@mui/material/Box'; +import Chip from '@mui/material/Chip'; +import Paper from '@mui/material/Paper'; +import Skeleton from '@mui/material/Skeleton'; +import Typography from '@mui/material/Typography'; +import Link from 'next/link'; +import { useParams } from 'next/navigation'; +import { api } from '~/trpc/react'; + +export default function BillDebatesPage() { + const params = useParams(); + const billId = Number(params.billId); + + const { data, isLoading } = api.congressionalRecord.debatesForBill.useQuery({ + legislationId: billId, + }); + + if (isLoading) { + return ( + + {Array.from({ length: 5 }).map((_, i) => ( + + ))} + + ); + } + + if (!data || data.length === 0) { + return ( + + + No Congressional Record debates found referencing this bill. + + + ); + } + + // Group by granule for better display + const byGranule = new Map(); + + for (const ref of data) { + const speech = ref.crec_speech; + if (!speech?.crec_granule) continue; + const gId = speech.crec_granule.crec_granule_id; + if (!byGranule.has(gId)) { + byGranule.set(gId, { + granuleId: gId, + issueId: speech.crec_granule.crec_issue_id ?? 0, + title: speech.crec_granule.title ?? 
'Untitled Debate', + section: speech.crec_granule.section, + issueDate: speech.crec_granule.crec_issue?.issue_date ?? null, + speeches: [], + }); + } + byGranule.get(gId)?.speeches.push(ref); + } + + return ( + + + {data.length} mention{data.length !== 1 ? 's' : ''} found in the Congressional Record + + + {Array.from(byGranule.values()).map((group) => { + const dateStr = group.issueDate + ? new Date(group.issueDate).toLocaleDateString('en-US', { + year: 'numeric', + month: 'long', + day: 'numeric', + }) + : ''; + + return ( + + + + + {group.title} + + + {group.section && ( + + )} + {dateStr && ( + + {dateStr} + + )} + + + {group.speeches.slice(0, 3).map((ref) => { + const speech = ref.crec_speech; + if (!speech) return null; + const leg = speech.legislator; + const name = leg + ? `${leg.first_name ?? ''} ${leg.last_name ?? ''}`.trim() + : speech.speaker_raw ?? 'Unknown'; + + return ( + + + {name[0] ?? '?'} + + + + {name} + {leg?.party ? ` (${leg.party})` : ''} + + + {speech.content_text} + + + + ); + })} + + {group.speeches.length > 3 && ( + + + View all {group.speeches.length} mentions in this debate + + + )} + + ); + })} + + ); +} diff --git a/hillstack/src/app/congress/legislators/[bioguideId]/page.tsx b/hillstack/src/app/congress/legislators/[bioguideId]/page.tsx index 20e74a9..c2abc90 100644 --- a/hillstack/src/app/congress/legislators/[bioguideId]/page.tsx +++ b/hillstack/src/app/congress/legislators/[bioguideId]/page.tsx @@ -19,6 +19,7 @@ import { import type { Params } from 'next/dist/server/request/params'; import Link from 'next/link'; import { LegislatorFollow } from '~/app/congress/legislators/[bioguideId]/follow'; +import { LegislatorSpeakingStats } from '~/components/record/LegislatorSpeakingStats'; import { stateAbbreviations } from '~/constants'; import { api, HydrateClient } from '~/trpc/server'; @@ -363,6 +364,7 @@ export default async function LegislatorPage({ ))} + diff --git a/hillstack/src/app/congress/record/[issueId]/[granuleId]/page.tsx 
b/hillstack/src/app/congress/record/[issueId]/[granuleId]/page.tsx new file mode 100644 index 0000000..a39950c --- /dev/null +++ b/hillstack/src/app/congress/record/[issueId]/[granuleId]/page.tsx @@ -0,0 +1,86 @@ +import Box from '@mui/material/Box'; +import Breadcrumbs from '@mui/material/Breadcrumbs'; +import Chip from '@mui/material/Chip'; +import Paper from '@mui/material/Paper'; +import Typography from '@mui/material/Typography'; +import Link from 'next/link'; +import { api } from '~/trpc/server'; +import { HydrateClient } from '~/trpc/server'; +import { SpeechCard } from '~/components/record/SpeechCard'; + +export default async function GranulePage({ + params, +}: { + params: Promise<{ issueId: string; granuleId: string }>; +}) { + const { issueId, granuleId } = await params; + const granule = await api.congressionalRecord.getGranule({ + granuleId: Number(granuleId), + }); + + const summary = granule.crec_summary[0]?.summary; + + return ( + + + + Record + + Daily Issue + + Debate + + + + {granule.title || 'Congressional Record Entry'} + + + + {granule.section && ( + + )} + {granule.page_start && ( + + )} + + + {summary && ( + + + AI Summary + + {summary} + + )} + + + Transcript + + + {granule.crec_speech.map((speech) => ( + + ))} + + {granule.crec_speech.length === 0 && ( + + No speech segments found for this entry. 
+ + )} + + + ); +} diff --git a/hillstack/src/app/congress/record/[issueId]/page.tsx b/hillstack/src/app/congress/record/[issueId]/page.tsx new file mode 100644 index 0000000..325c721 --- /dev/null +++ b/hillstack/src/app/congress/record/[issueId]/page.tsx @@ -0,0 +1,143 @@ +import Box from '@mui/material/Box'; +import Chip from '@mui/material/Chip'; +import Divider from '@mui/material/Divider'; +import Grid from '@mui/material/Grid'; +import Paper from '@mui/material/Paper'; +import Typography from '@mui/material/Typography'; +import Link from 'next/link'; +import { api } from '~/trpc/server'; +import { HydrateClient } from '~/trpc/server'; + +export default async function IssuePage({ + params, +}: { + params: Promise<{ issueId: string }>; +}) { + const { issueId } = await params; + const issue = await api.congressionalRecord.getIssue({ + issueId: Number(issueId), + }); + + const dateStr = issue.issue_date + ? new Date(issue.issue_date).toLocaleDateString('en-US', { + weekday: 'long', + year: 'numeric', + month: 'long', + day: 'numeric', + }) + : 'Unknown Date'; + + const sections = ['Senate', 'House', 'Extensions', 'DailyDigest'] as const; + const granulesBySection: Record = {}; + for (const g of issue.crec_granule) { + const s = g.section ?? 'Other'; + if (!granulesBySection[s]) granulesBySection[s] = []; + granulesBySection[s].push(g); + } + + return ( + + + + Congressional Record + + + {dateStr} + + + {issue.crec_summary.length > 0 && ( + + + AI Summary + + {issue.crec_summary.map((s, i) => ( + + {s.summary} + + ))} + + )} + + + {sections.map((section) => { + const granules = granulesBySection[section]; + if (!granules || granules.length === 0) return null; + + return ( + + + + {section === 'DailyDigest' + ? 'Daily Digest' + : section === 'Extensions' + ? 
'Extensions of Remarks' + : section} + + + {granules.map((g) => ( + + + {g.title || 'Untitled'} + + + {g.page_start && ( + + )} + + + {g.crec_summary[0]?.summary && ( + + {g.crec_summary[0].summary} + + )} + + ))} + + + ); + })} + + + + ); +} diff --git a/hillstack/src/app/congress/record/page.tsx b/hillstack/src/app/congress/record/page.tsx new file mode 100644 index 0000000..651c8e4 --- /dev/null +++ b/hillstack/src/app/congress/record/page.tsx @@ -0,0 +1,99 @@ +'use client'; + +import Box from '@mui/material/Box'; +import Chip from '@mui/material/Chip'; +import List from '@mui/material/List'; +import ListItem from '@mui/material/ListItem'; +import ListItemButton from '@mui/material/ListItemButton'; +import ListItemText from '@mui/material/ListItemText'; +import Paper from '@mui/material/Paper'; +import Skeleton from '@mui/material/Skeleton'; +import Typography from '@mui/material/Typography'; +import Link from 'next/link'; +import { api } from '~/trpc/react'; + +export default function CongressionalRecordPage() { + const { data, isLoading } = api.congressionalRecord.list.useQuery({ + page: 1, + pageSize: 30, + }); + + return ( + + + Congressional Record + + + Browse daily transcripts of Congressional debates, speeches, and proceedings. + + + + {isLoading ? ( + + {Array.from({ length: 10 }).map((_, i) => ( + + ))} + + ) : ( + + {data?.issues?.map((issue) => { + const dateStr = issue.issue_date + ? new Date(issue.issue_date).toLocaleDateString('en-US', { + weekday: 'long', + year: 'numeric', + month: 'long', + day: 'numeric', + }) + : 'Unknown Date'; + + const sectionCounts: Record = {}; + for (const g of issue.crec_granule) { + const s = g.section ?? 'Other'; + sectionCounts[s] = (sectionCounts[s] ?? 
0) + 1; + } + + return ( + + + + {Object.entries(sectionCounts).map( + ([section, count]) => ( + + ), + )} + + } + /> + + + ); + })} + {data?.issues?.length === 0 && ( + + + + )} + + )} + + + ); +} diff --git a/hillstack/src/app/congress/record/stats/page.tsx b/hillstack/src/app/congress/record/stats/page.tsx new file mode 100644 index 0000000..f62d4d5 --- /dev/null +++ b/hillstack/src/app/congress/record/stats/page.tsx @@ -0,0 +1,139 @@ +'use client'; + +import Avatar from '@mui/material/Avatar'; +import Box from '@mui/material/Box'; +import Chip from '@mui/material/Chip'; +import Paper from '@mui/material/Paper'; +import Skeleton from '@mui/material/Skeleton'; +import Table from '@mui/material/Table'; +import TableBody from '@mui/material/TableBody'; +import TableCell from '@mui/material/TableCell'; +import TableContainer from '@mui/material/TableContainer'; +import TableHead from '@mui/material/TableHead'; +import TableRow from '@mui/material/TableRow'; +import Typography from '@mui/material/Typography'; +import Link from 'next/link'; +import { api } from '~/trpc/react'; + +function getPartyColor(party: string | null | undefined): string { + if (!party) return '#9e9e9e'; + const p = party.toLowerCase(); + if (p.startsWith('r')) return '#e53935'; + if (p.startsWith('d')) return '#1e88e5'; + return '#9e9e9e'; +} + +export default function SpeakerStatsPage() { + const { data, isLoading } = api.congressionalRecord.speakerStats.useQuery({ + limit: 50, + page: 1, + }); + + return ( + + + Speaker Statistics + + + Top Congressional speakers by total word count in the Congressional Record. + + + + + + + Rank + Legislator + Party + State + Total Words + Speeches + Avg Words/Speech + + + + {isLoading + ? Array.from({ length: 20 }).map((_, i) => ( + + + + + + + + + + )) + : data?.speakers?.map((speaker, idx) => { + const avg = speaker.speech_count > 0 + ? Math.round(speaker.total_words / speaker.speech_count) + : 0; + + return ( + + {idx + 1} + + + {speaker.image_url ? 
( + + ) : ( + + {(speaker.first_name?.[0] ?? '') + + (speaker.last_name?.[0] ?? '')} + + )} + + {speaker.first_name} {speaker.last_name} + + + + + + + {speaker.state ?? '-'} + + {speaker.total_words.toLocaleString()} + + + {speaker.speech_count.toLocaleString()} + + + {avg.toLocaleString()} + + + ); + })} + +
+
+ + {!isLoading && data?.speakers?.length === 0 && ( + + No speaker data available yet. Run the Congressional Record importer first. + + )} +
+ ); +} diff --git a/hillstack/src/components/record/LegislatorSpeakingStats.tsx b/hillstack/src/components/record/LegislatorSpeakingStats.tsx new file mode 100644 index 0000000..7bcf2dd --- /dev/null +++ b/hillstack/src/components/record/LegislatorSpeakingStats.tsx @@ -0,0 +1,155 @@ +'use client'; + +import ArticleOutlinedIcon from '@mui/icons-material/ArticleOutlined'; +import Box from '@mui/material/Box'; +import Card from '@mui/material/Card'; +import Chip from '@mui/material/Chip'; +import List from '@mui/material/List'; +import ListItem from '@mui/material/ListItem'; +import ListItemButton from '@mui/material/ListItemButton'; +import Toolbar from '@mui/material/Toolbar'; +import Typography from '@mui/material/Typography'; +import Link from 'next/link'; +import { api } from '~/trpc/react'; + +export function LegislatorSpeakingStats({ + bioguideId, +}: { + bioguideId: string; +}) { + const { data, isLoading } = api.congressionalRecord.legislatorStats.useQuery({ + bioguideId, + }); + + if (isLoading || !data || data.speechCount === 0) { + return null; + } + + return ( + + + + Congressional Record Activity + + + + + + {data.totalWords.toLocaleString()} + + + Total Words + + + + + {data.speechCount.toLocaleString()} + + + Speeches + + + {data.speechCount > 0 && ( + + + {Math.round(data.totalWords / data.speechCount).toLocaleString()} + + + Avg Words/Speech + + + )} + + + + {data.recentSpeeches.length > 0 && ( + <> + + + Recent Speeches + + + + {data.recentSpeeches.map((speech) => { + const granule = speech.crec_granule; + const issueDate = granule?.crec_issue?.issue_date; + const dateStr = issueDate + ? new Date(issueDate).toLocaleDateString('en-US', { + month: 'short', + day: 'numeric', + year: 'numeric', + }) + : ''; + + return ( + + + + + + {granule?.title ?? 
'Speech'} + + + + + {speech.word_count?.toLocaleString()} words + + + + + ); + })} + + + )} + + ); +} diff --git a/hillstack/src/components/record/SpeechCard.tsx b/hillstack/src/components/record/SpeechCard.tsx new file mode 100644 index 0000000..45ba78c --- /dev/null +++ b/hillstack/src/components/record/SpeechCard.tsx @@ -0,0 +1,117 @@ +import Avatar from '@mui/material/Avatar'; +import Box from '@mui/material/Box'; +import Chip from '@mui/material/Chip'; +import Paper from '@mui/material/Paper'; +import Tooltip from '@mui/material/Tooltip'; +import Typography from '@mui/material/Typography'; +import Link from 'next/link'; +import type { inferRouterOutputs } from '@trpc/server'; +import type { AppRouter } from '~/server/api/root'; + +type GranuleOutput = inferRouterOutputs['congressionalRecord']['getGranule']; +type SpeechType = GranuleOutput['crec_speech'][number]; + +function getPartyColor(party: string | null | undefined): string { + if (!party) return 'default'; + const p = party.toLowerCase(); + if (p.startsWith('r')) return '#e53935'; + if (p.startsWith('d')) return '#1e88e5'; + return '#9e9e9e'; +} + +export function SpeechCard({ speech }: { speech: SpeechType }) { + const legislator = speech.legislator; + const speakerName = legislator + ? `${legislator.first_name ?? ''} ${legislator.last_name ?? ''}`.trim() + : speech.speaker_raw || 'Unknown Speaker'; + + const partyColor = getPartyColor(legislator?.party); + + return ( + + + {legislator?.image_url ? ( + + ) : ( + + {speakerName[0] ?? '?'} + + )} + + + {speech.legislator_bioguide_id ? ( + + + {speakerName} + + + ) : ( + + {speakerName} + + )} + {legislator && ( + + {legislator.party ?? ''} - {legislator.state ?? ''} + + )} + + + {speech.word_count != null && ( + + )} + + + + {speech.content_text} + + + {speech.crec_bill_reference.length > 0 && ( + + {speech.crec_bill_reference.map((ref) => ( + + {ref.legislation_id ? 
( + + ) : ( + + )} + + ))} + + )} + + ); +} diff --git a/hillstack/src/constants/navigation.tsx b/hillstack/src/constants/navigation.tsx index e38664e..83ef938 100644 --- a/hillstack/src/constants/navigation.tsx +++ b/hillstack/src/constants/navigation.tsx @@ -1,4 +1,5 @@ import AccountBalanceOutlinedIcon from '@mui/icons-material/AccountBalanceOutlined'; +import ArticleOutlinedIcon from '@mui/icons-material/ArticleOutlined'; import AutoGraphIcon from '@mui/icons-material/AutoGraph'; import ChevronLeftOutlinedIcon from '@mui/icons-material/ChevronLeftOutlined'; import Diversity2Icon from '@mui/icons-material/Diversity2'; @@ -48,9 +49,15 @@ export const congressTabs: NavigationTabs = { icon: , label: 'Insights', }, + '/congress/record': { + id: 6, + icon: , + label: 'Record', + }, }; import DifferenceIcon from '@mui/icons-material/Difference'; +import ForumIcon from '@mui/icons-material/Forum'; import LocalAtmIcon from '@mui/icons-material/LocalAtm'; import ManageSearchIcon from '@mui/icons-material/ManageSearch'; import SmartButtonIcon from '@mui/icons-material/SmartButton'; @@ -82,5 +89,10 @@ export const congressBillTabs = ({ icon: , label: 'Spending', }, + [`/congress/bills/${params.billId}/debates`]: { + id: 4, + icon: , + label: 'Debates', + }, }; }; diff --git a/hillstack/src/server/api/root.ts b/hillstack/src/server/api/root.ts index c2ba5ca..af51a3f 100644 --- a/hillstack/src/server/api/root.ts +++ b/hillstack/src/server/api/root.ts @@ -1,4 +1,5 @@ import { billRouter } from '~/server/api/routers/bill'; +import { congressionalRecordRouter } from '~/server/api/routers/congressionalRecord'; import { statsRouter } from '~/server/api/routers/stats'; import { userRouter } from '~/server/api/routers/user'; import { createCallerFactory, createTRPCRouter } from '~/server/api/trpc'; @@ -12,6 +13,7 @@ import { legislatorRouter } from './routers/legislator'; */ export const appRouter = createTRPCRouter({ bill: billRouter, + congressionalRecord: 
congressionalRecordRouter, legislator: legislatorRouter, committee: committeeRouter, stats: statsRouter, diff --git a/hillstack/src/server/api/routers/congressionalRecord.ts b/hillstack/src/server/api/routers/congressionalRecord.ts new file mode 100644 index 0000000..610ae3e --- /dev/null +++ b/hillstack/src/server/api/routers/congressionalRecord.ts @@ -0,0 +1,407 @@ +import { z } from 'zod'; +import { createTRPCRouter, publicProcedure } from '~/server/api/trpc'; + +export const congressionalRecordRouter = createTRPCRouter({ + list: publicProcedure + .input( + z.object({ + page: z.number().default(1), + pageSize: z.number().default(20), + startDate: z.string().optional(), + endDate: z.string().optional(), + }), + ) + .query(async ({ input, ctx }) => { + const { page, pageSize, startDate, endDate } = input; + + const where: Record = {}; + if (startDate || endDate) { + where.issue_date = {}; + if (startDate) (where.issue_date as Record).gte = new Date(startDate); + if (endDate) (where.issue_date as Record).lte = new Date(endDate); + } + + const [issues, totalResults] = await Promise.all([ + ctx.db.crec_issue.findMany({ + select: { + crec_issue_id: true, + issue_date: true, + congress_id: true, + package_id: true, + crec_granule: { + select: { + crec_granule_id: true, + section: true, + }, + }, + }, + where, + orderBy: { issue_date: 'desc' }, + skip: (page - 1) * pageSize, + take: pageSize, + }), + ctx.db.crec_issue.count({ where }), + ]); + + return { issues, totalResults }; + }), + + getIssue: publicProcedure + .input(z.object({ issueId: z.number() })) + .query(async ({ input, ctx }) => { + const issue = await ctx.db.crec_issue.findUniqueOrThrow({ + select: { + crec_issue_id: true, + issue_date: true, + congress_id: true, + package_id: true, + crec_granule: { + select: { + crec_granule_id: true, + granule_id: true, + section: true, + title: true, + page_start: true, + page_end: true, + order_number: true, + crec_summary: { + select: { + summary: true, + }, + where: 
{ + summary_type: 'granule', + }, + take: 1, + }, + _count: { + select: { + crec_speech: true, + }, + }, + }, + orderBy: { order_number: 'asc' }, + }, + crec_summary: { + select: { + summary: true, + summary_type: true, + }, + where: { + summary_type: 'daily', + }, + }, + }, + where: { crec_issue_id: input.issueId }, + }); + + return issue; + }), + + getGranule: publicProcedure + .input(z.object({ granuleId: z.number() })) + .query(async ({ input, ctx }) => { + const granule = await ctx.db.crec_granule.findUniqueOrThrow({ + select: { + crec_granule_id: true, + crec_issue_id: true, + granule_id: true, + section: true, + title: true, + page_start: true, + page_end: true, + crec_speech: { + select: { + crec_speech_id: true, + speaker_raw: true, + legislator_bioguide_id: true, + order_number: true, + content_text: true, + word_count: true, + legislator: { + select: { + first_name: true, + last_name: true, + party: true, + state: true, + image_url: true, + }, + }, + crec_bill_reference: { + select: { + crec_bill_reference_id: true, + legislation_id: true, + cite_text: true, + cite_type: true, + start_offset: true, + end_offset: true, + legislation: { + select: { + legislation_id: true, + title: true, + number: true, + chamber: true, + }, + }, + }, + }, + }, + orderBy: { order_number: 'asc' }, + }, + crec_summary: { + select: { + summary: true, + }, + where: { + summary_type: 'granule', + }, + take: 1, + }, + }, + where: { crec_granule_id: input.granuleId }, + }); + + return granule; + }), + + speakerStats: publicProcedure + .input( + z.object({ + startDate: z.string().optional(), + endDate: z.string().optional(), + chamber: z.string().optional(), + limit: z.number().default(20), + page: z.number().default(1), + }), + ) + .query(async ({ input, ctx }) => { + const { startDate, endDate, chamber, limit, page } = input; + + // Build date filter for the join through granule -> issue + const granuleWhere: Record = {}; + if (chamber) { + granuleWhere.section = chamber; + } + 
if (startDate || endDate) { + const dateFilter: Record = {}; + if (startDate) dateFilter.gte = new Date(startDate); + if (endDate) dateFilter.lte = new Date(endDate); + granuleWhere.crec_issue = { issue_date: dateFilter }; + } + + // Use raw SQL for aggregation since Prisma doesn't support groupBy with sum well + const dateConditions: string[] = []; + const params: unknown[] = []; + let paramIdx = 1; + + if (startDate) { + dateConditions.push(`ci.issue_date >= $${paramIdx}::date`); + params.push(startDate); + paramIdx++; + } + if (endDate) { + dateConditions.push(`ci.issue_date <= $${paramIdx}::date`); + params.push(endDate); + paramIdx++; + } + if (chamber) { + dateConditions.push(`cg.section = $${paramIdx}::text`); + params.push(chamber); + paramIdx++; + } + + const whereClause = dateConditions.length > 0 + ? `AND ${dateConditions.join(' AND ')}` + : ''; + + const offset = (page - 1) * limit; + params.push(limit, offset); + + const results = await ctx.db.$queryRawUnsafe>( + `SELECT + cs.legislator_bioguide_id as bioguide_id, + l.first_name, + l.last_name, + l.party, + l.state, + l.image_url, + SUM(cs.word_count)::bigint as total_words, + COUNT(cs.crec_speech_id)::bigint as speech_count + FROM crec_speech cs + JOIN crec_granule cg ON cs.crec_granule_id = cg.crec_granule_id + JOIN crec_issue ci ON cg.crec_issue_id = ci.crec_issue_id + JOIN legislator l ON cs.legislator_bioguide_id = l.bioguide_id + WHERE cs.legislator_bioguide_id IS NOT NULL ${whereClause} + GROUP BY cs.legislator_bioguide_id, l.first_name, l.last_name, l.party, l.state, l.image_url + ORDER BY total_words DESC + LIMIT $${paramIdx} OFFSET $${paramIdx + 1}`, + ...params, + ); + + const countResult = await ctx.db.$queryRawUnsafe>( + `SELECT COUNT(DISTINCT cs.legislator_bioguide_id)::bigint as count + FROM crec_speech cs + JOIN crec_granule cg ON cs.crec_granule_id = cg.crec_granule_id + JOIN crec_issue ci ON cg.crec_issue_id = ci.crec_issue_id + WHERE cs.legislator_bioguide_id IS NOT NULL 
${whereClause}`, + ...params.slice(0, params.length - 2), + ); + + return { + speakers: results.map((r) => ({ + ...r, + total_words: Number(r.total_words), + speech_count: Number(r.speech_count), + })), + totalResults: Number(countResult[0]?.count ?? 0), + }; + }), + + activityCalendar: publicProcedure + .input( + z.object({ + startDate: z.string().optional(), + endDate: z.string().optional(), + }).optional(), + ) + .query(async ({ input, ctx }) => { + const where: Record = {}; + if (input?.startDate || input?.endDate) { + where.issue_date = {}; + if (input?.startDate) (where.issue_date as Record).gte = new Date(input.startDate); + if (input?.endDate) (where.issue_date as Record).lte = new Date(input.endDate); + } + + const issues = await ctx.db.crec_issue.findMany({ + select: { + issue_date: true, + _count: { + select: { + crec_granule: true, + }, + }, + }, + where, + orderBy: { issue_date: 'asc' }, + }); + + return issues.map((i) => ({ + date: i.issue_date, + count: i._count.crec_granule, + })); + }), + + // For bill detail page - get debates referencing a specific bill + debatesForBill: publicProcedure + .input(z.object({ legislationId: z.number() })) + .query(async ({ input, ctx }) => { + const references = await ctx.db.crec_bill_reference.findMany({ + select: { + cite_text: true, + crec_speech: { + select: { + crec_speech_id: true, + speaker_raw: true, + legislator_bioguide_id: true, + content_text: true, + word_count: true, + legislator: { + select: { + first_name: true, + last_name: true, + party: true, + }, + }, + crec_granule: { + select: { + crec_granule_id: true, + crec_issue_id: true, + title: true, + section: true, + crec_issue: { + select: { + issue_date: true, + }, + }, + }, + }, + }, + }, + }, + where: { + legislation_id: input.legislationId, + }, + take: 50, + }); + + return references; + }), + + // For legislator page - get speaking stats for a specific legislator + legislatorStats: publicProcedure + .input(z.object({ bioguideId: z.string() })) + 
.query(async ({ input, ctx }) => { + const stats = await ctx.db.$queryRawUnsafe>( + `SELECT + SUM(cs.word_count)::bigint as total_words, + COUNT(cs.crec_speech_id)::bigint as speech_count, + MIN(ci.issue_date) as first_date, + MAX(ci.issue_date) as last_date + FROM crec_speech cs + JOIN crec_granule cg ON cs.crec_granule_id = cg.crec_granule_id + JOIN crec_issue ci ON cg.crec_issue_id = ci.crec_issue_id + WHERE cs.legislator_bioguide_id = $1`, + input.bioguideId, + ); + + const recentSpeeches = await ctx.db.crec_speech.findMany({ + select: { + crec_speech_id: true, + content_text: true, + word_count: true, + crec_granule: { + select: { + crec_granule_id: true, + crec_issue_id: true, + title: true, + section: true, + crec_issue: { + select: { + issue_date: true, + }, + }, + }, + }, + }, + where: { + legislator_bioguide_id: input.bioguideId, + }, + orderBy: { + crec_speech_id: 'desc', + }, + take: 10, + }); + + const stat = stats[0]; + return { + totalWords: Number(stat?.total_words ?? 0), + speechCount: Number(stat?.speech_count ?? 0), + firstDate: stat?.first_date ?? null, + lastDate: stat?.last_date ?? null, + recentSpeeches, + }; + }), +});