diff --git a/mkdocs.yml b/mkdocs.yml
index c81a99bc26..de7ed1704e 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -47,6 +47,7 @@ nav:
           - Coordinate systems: appendices/coordinate-systems.md
           - Quantitative MRI: appendices/qmri.md
           - Arterial Spin Labeling: appendices/arterial-spin-labeling.md
+          - Media files: appendices/media-files.md
           - Cross modality correspondence: appendices/cross-modality-correspondence.md
       - Changelog: CHANGES.md
   - The BIDS Website:
diff --git a/src/appendices/media-files.md b/src/appendices/media-files.md
new file mode 100644
index 0000000000..62759f3dfb
--- /dev/null
+++ b/src/appendices/media-files.md
@@ -0,0 +1,165 @@
+# Media Files
+
+## Introduction
+
+Several BIDS datatypes make use of media files — audio recordings, video recordings,
+combined audio-video recordings, and still images.
+This appendix defines the common file formats, metadata conventions,
+and codec identification schemes shared across all datatypes that use media files.
+
+The following media suffixes are defined:
+
+{{ MACROS___make_suffix_table(["audio", "video", "audiovideo", "image"]) }}
+
+Datatypes that incorporate media files (for example, behavioral recordings or stimuli)
+define their own file-naming rules, directory placement, and datatype-specific metadata.
+The conventions described here apply uniformly to all such datatypes.
+
+### Relationship to the `photo` suffix
+
+The media file definitions introduced here generalize the handling of media content in BIDS.
+The existing `photo` suffix (used for photographs of anatomical landmarks,
+head localization coils, and tissue samples) predates this framework and covers
+a narrower use case — still images in specific electrophysiology and microscopy datatypes.
+
+The media suffixes (`audio`, `video`, `audiovideo`, `image`) are intended as the
+general-purpose mechanism for all media content in BIDS.
+The media file framework should be generally adopted for new datatypes,
+and a future proposal may deprecate the `photo` suffix in favor of the broader `image`
+suffix with appropriate migration tooling
+(see [bids-utils](https://github.com/bids-standard/bids-utils)).
+
+## Supported Formats
+
+### Audio formats
+
+{{ MACROS___make_extension_table(["wav", "mp3", "aac", "ogg"]) }}
+
+### Video container formats
+
+{{ MACROS___make_extension_table(["mp4", "avi", "mkv", "webm"]) }}
+
+### Image formats
+
+{{ MACROS___make_extension_table(["jpg", "png", "svg", "webp", "tif", "tiff"]) }}
+
+When choosing a format, consider the trade-offs among file size, data fidelity, openness, and the prevalence of the format in the domain of application.
+Uncompressed or lossless formats (WAV, PNG, TIFF) preserve full quality
+but produce larger files.
+Lossy formats (MP3, AAC, JPEG) significantly reduce file size
+at the cost of some data loss.
+
+## Media Stream Metadata
+
+Media files SHOULD be accompanied by a JSON sidecar file
+containing technical metadata about the media streams.
+The following metadata fields are defined for media files.
+
+### Duration
+
+Applies to suffixes: `audio`, `video`, `audiovideo`.
+
+{{ MACROS___make_sidecar_table("media.MediaDuration") }}
+
+`RecordingDuration` reuses the existing BIDS metadata field already defined for
+electrophysiology recordings (EEG, iEEG, MEG, and others).
+
+### Audio stream properties
+
+Applies to suffixes: `audio`, `audiovideo`.
+
+{{ MACROS___make_sidecar_table("media.MediaAudioProperties") }}
+
+Note: `AudioSampleRate` is used instead of the existing `SamplingFrequency` field
+because audio-video files require distinguishing the audio sampling rate from the
+video frame rate. The `Audio` prefix makes this unambiguous in multi-stream containers.
+
+### Image properties
+
+Applies to suffixes: `video`, `audiovideo`, `image`.
+
+{{ MACROS___make_sidecar_table("media.MediaImageProperties") }}
+
+### Video stream properties
+
+Applies to suffixes: `video`, `audiovideo`.
+
+{{ MACROS___make_sidecar_table("media.MediaVideoProperties") }}
+
+## Codec Identification
+
+Codec identification uses two complementary naming systems:
+
+### FFmpeg codec names (RECOMMENDED)
+
+The `AudioCodec` and `VideoCodec` fields use
+[FFmpeg codec names](https://www.ffmpeg.org/ffmpeg-codecs.html) as the RECOMMENDED
+convention. These names are the de facto standard in scientific computing and can be
+auto-extracted from media files using:
+
+```bash
+ffprobe -v quiet -print_format json -show_streams <file>
+```
+
+### RFC 6381 codec strings (OPTIONAL)
+
+The `AudioCodecRFC6381` and `VideoCodecRFC6381` fields use
+[RFC 6381](https://datatracker.ietf.org/doc/html/rfc6381) codec strings.
+These provide precise codec profile and level information useful for
+web and broadcast interoperability.
+
+### Common codec reference
+
+| Codec          | FFmpeg Name | RFC 6381 String    | Notes                   |
+| -------------- | ----------- | ------------------ | ----------------------- |
+| H.264 / AVC    | `h264`      | `avc1.640028`      | Most widely supported   |
+| H.265 / HEVC   | `hevc`      | `hev1.1.6.L93.B0`  | High efficiency         |
+| VP9            | `vp9`       | `vp09.00.10.08`    | Open, royalty-free      |
+| AV1            | `av1`       | `av01.0.01M.08`    | Next-gen open codec     |
+| AAC-LC         | `aac`       | `mp4a.40.2`        | Default audio for MP4   |
+| MP3            | `mp3`       | `mp4a.6B`          | Legacy lossy audio      |
+| Opus           | `opus`      | `Opus`             | Open, low-latency audio |
+| FLAC           | `flac`      | `fLaC`             | Open lossless audio     |
+| PCM 16-bit LE  | `pcm_s16le` | —                  | Uncompressed (WAV)      |
+
+The FFmpeg name column shows the value to use for `VideoCodec` or `AudioCodec`.
+The RFC 6381 column shows the value for `VideoCodecRFC6381` or `AudioCodecRFC6381`.
+RFC 6381 strings vary by profile and level;
+the values shown are representative examples.
+
+## Privacy Considerations
+
+Media files — particularly audio and video recordings — may contain
+personally identifiable information (PII), including but not limited to:
+
+-   Voices and speech content
+-   Facial features and other physical characteristics
+-   Background environments that could identify locations
+-   Metadata embedded in file headers (for example, GPS coordinates, device identifiers)
+
+Researchers MUST ensure that sharing of media files complies with the
+informed consent obtained from participants and with applicable privacy regulations.
+De-identification techniques (for example, voice distortion, face blurring,
+metadata stripping) SHOULD be applied where appropriate before data sharing.
+
+## Example
+
+A complete sidecar JSON file for an audio-video recording:
+
+```json
+{
+    "RecordingDuration": 312.5,
+    "VideoCodec": "h264",
+    "VideoCodecRFC6381": "avc1.640028",
+    "VideoFrameRate": 30,
+    "VideoFrameCount": 9375,
+    "ImageWidth": 1920,
+    "ImageHeight": 1080,
+    "ImagePixelFormat": "yuv420p",
+    "ImageBitDepth": 8,
+    "AudioCodec": "aac",
+    "AudioCodecRFC6381": "mp4a.40.2",
+    "AudioSampleRate": 48000,
+    "AudioChannelCount": 2
+}
+```
diff --git a/src/modality-specific-files/behavioral-experiments.md b/src/modality-specific-files/behavioral-experiments.md
index 81af91e65c..59b9e80a73 100644
--- a/src/modality-specific-files/behavioral-experiments.md
+++ b/src/modality-specific-files/behavioral-experiments.md
@@ -1,4 +1,4 @@
-# Behavioral experiments (with no neural recordings)
+# Behavioral recordings
 
 !!! example "Example datasets"
 
@@ -15,19 +15,14 @@ and a guide for using macros can be found at
 -->
 {{ MACROS___make_filename_template("raw", datatypes=["beh"]) }}
 
-In addition to logs from behavioral experiments
-performed alongside imaging data acquisitions,
-one MAY also include data from experiments
-performed with no neural recordings.
-The results of those experiments MAY be stored in the `beh` directory
-using the same formats for event timing (`_events.tsv`),
-metadata (`_events.json`),
-physiological (`_physio.tsv.gz`, `_physio.json`)
-and other continuous recordings (`_stim.tsv.gz`, `_stim.json`)
-as for tasks performed during MRI, electrophysiological or other neural recordings.
-Additionally, events files
-that do not include the mandatory `onset` and `duration` columns
-MAY be included,
+The `beh` directory MAY store behavioral recordings such as audio (`_audio.*`), video (`_video.*`), combined audio-video (`_audiovideo.*`), and still image (`_image.*`) recordings, physiological (`_physio.*`) recordings, and other continuous recordings (`_stim.tsv.gz`, `_stim.json`).
+Audio, video, audio-video, and image recordings MAY capture subjects performing tasks or at rest, or the stimuli being presented to the subject.
+Audio/video recordings MAY occur simultaneously with other recordings, such as BOLD or EEG.
+Relative timing between files can be determined by consulting the `scans.tsv` file.
+If no `scans.tsv` file is present, the alignment is undefined.
+The `beh` directory MAY also contain event timing files (`_events.tsv`) and their associated metadata (`_events.json`) for behavioral experiments that do not have corresponding neuroimaging or functional data.
+
+Additionally, events files that do not include the mandatory `onset` and `duration` columns MAY be included,
 but MUST be labeled `_beh.tsv` rather than `_events.tsv`.
 
 The following OPTIONAL columns are pre-defined for behavioral data files:
@@ -76,6 +71,234 @@ A guide for using macros can be found at
 -->
 {{ MACROS___make_sidecar_table("beh.BEHInstitutionInformation") }}
 
+## Audio, video, and audio-video recordings and images
+
+Audio and video recordings of behaving subjects MAY be stored in the `beh` directory
+using the `_audio`, `_video`, and `_audiovideo` suffixes.
+The `_audio` suffix is for audio-only recordings, `_video` for video-only recordings,
+and `_audiovideo` for recordings that contain both audio and video streams.
+These recordings are typically used to capture vocalizations, speech, facial expressions,
+body movements, or other behavioral aspects during experimental tasks or rest periods.
+
+Still images captured during behavioral experiments MAY be stored in the `beh` directory
+using the `_image` suffix.
+These images are typically used for training frames for pose estimation,
+snapshots of behavioral setups, or individual frames extracted from video recordings.
+
+!!! warning "Privacy and personally identifiable information"
+
+    Audio and video recordings and images of human subjects often contain personally identifiable
+    information (PII) such as faces, voices, and other identifying features.
+    Data curators MUST take special care to ensure compliance with applicable privacy
+    regulations (such as HIPAA in the United States, GDPR in the European Union, or other
+    local data protection laws) when handling these recordings.
+
+    Unless appropriate privacy protections are implemented, these recordings are generally
+    better suited to internal use or to datasets of non-human subjects.
+
+### File formats
+
+Audio recordings MUST use one of the following extensions:
+
+-   `.flac` - Free Lossless Audio Codec
+-   `.mp3` - MPEG Audio Layer III
+-   `.ogg` - Ogg Vorbis
+-   `.wav` - Waveform Audio File Format
+
+Video and audio-video recordings MUST use one of the following extensions:
+
+-   `.mp4` - MPEG-4 Part 14
+-   `.mkv` - Matroska video container
+-   `.avi` - Audio Video Interleave
+
+Image files MUST use one of the following extensions:
+
+-   `.jpg` - JPEG image
+-   `.png` - Portable Network Graphics
+
+### Entities
+
+Audio and video files MAY use the following entities:
+
+-   `task` - OPTIONAL for audio and video recordings
+-   `acq` - OPTIONAL, can distinguish different recording setups
+-   `run` - OPTIONAL, for multiple recordings with identical parameters
+-   `recording` - OPTIONAL, to differentiate simultaneous recordings from different angles, locations, or devices
+-   `split` - OPTIONAL, for continuous recordings split into multiple files
+
+### Examples
+
+<!-- This block generates a file tree.
+A guide for using macros can be found at
+ https://github.com/bids-standard/bids-specification/blob/master/macros_doc.md
+-->
+{{ MACROS___make_filetree_example(
+   {
+   "sub-01": {
+      "beh": {
+         "sub-01_task-rest_video.mp4": "",
+         "sub-01_task-rest_video.json": "",
+         "sub-01_task-interview_audiovideo.mp4": "",
+         "sub-01_task-interview_audiovideo.json": "",
+         "sub-01_task-stroop_recording-face_video.mp4": "",
+         "sub-01_task-stroop_recording-face_video.json": "",
+         "sub-01_task-stroop_recording-room_video.mp4": "",
+         "sub-01_task-stroop_recording-room_video.json": "",
+         "sub-01_task-rest_image.jpg": "",
+         "sub-01_task-rest_image.json": "",
+         "sub-01_task-vocalization_audio.wav": "",
+         "sub-01_task-vocalization_audio.json": "",
+         },
+      },
+   }
+) }}
+
+For continuous recordings split into multiple files:
+
+<!-- This block generates a file tree.
+A guide for using macros can be found at
+ https://github.com/bids-standard/bids-specification/blob/master/macros_doc.md
+-->
+{{ MACROS___make_filetree_example(
+   {
+   "sub-01": {
+      "ses-01": {
+         "beh": {
+            "sub-01_ses-01_task-freeplay_run-01_split-001_video.mp4": "",
+            "sub-01_ses-01_task-freeplay_run-01_split-002_video.mp4": "",
+            "sub-01_ses-01_task-freeplay_run-01_split-003_video.mp4": "",
+            "sub-01_ses-01_task-freeplay_run-01_video.json": "",
+            },
+         },
+      },
+   }
+) }}
+
+### Sidecar JSON for audio, video, audio-video recordings, and images
+
+The following metadata fields are available for audio, video, audio-video recordings, and images:
+
+<!-- This block generates a metadata table.
+These tables are defined in
+  src/schema/rules/sidecars
+The definitions of the fields specified in these tables may be found in
+  src/schema/objects/metadata.yaml
+A guide for using macros can be found at
+ https://github.com/bids-standard/bids-specification/blob/master/macros_doc.md
+-->
+{{ MACROS___make_sidecar_table("beh.AudioVideoImageDevice") }}
+
+!!! note "Licensing for recordings containing participants"
+
+    Audio, video, and image recordings of participants may have different licensing
+    restrictions than the main dataset due to privacy considerations. The optional
+    `License` field can be used to specify different terms for individual recordings
+    that contain identifiable participant data. If not specified, the recording
+    inherits the license from `dataset_description.json`.
+
+{{ MACROS___make_sidecar_table("beh.AudioVideoDuration") }}
+
+The following fields are available for audio recordings (`_audio`) and audio-video recordings (`_audiovideo`):
+
+{{ MACROS___make_sidecar_table("beh.AudioStreams") }}
+
+The following fields are available for video recordings (`_video`) and audio-video recordings (`_audiovideo`):
+
+{{ MACROS___make_sidecar_table("beh.VideoStreams") }}
+
+The following fields are available for image files (`_image`):
+
+{{ MACROS___make_sidecar_table("beh.AudioVideoImageDevice") }}
+
+{{ MACROS___make_sidecar_table("beh.ImageProperties") }}
+
+### Example audio-video sidecar JSON
+
+For an audio-video file containing both video and audio streams:
+
+```JSON
+{
+  "TaskName": "RestingState",
+  "Device": "Sony FDR-AX53",
+  "AudioChannelCount": 2,
+  "AudioSampleRate": 48000,
+  "FrameRate": 30.0,
+  "Height": 1080,
+  "Width": 1920,
+  "Duration": 600.5
+}
+```
+
+### Example video sidecar JSON
+
+For a video-only recording:
+
+```JSON
+{
+  "TaskName": "RestingState",
+  "Device": "Sony FDR-AX53",
+  "FrameRate": 30.0,
+  "Height": 1080,
+  "Width": 1920,
+  "Duration": 600.5
+}
+```
+
+### Example audio sidecar JSON
+
+For an audio-only recording:
+
+```JSON
+{
+  "TaskName": "Vocalization",
+  "Device": "Zoom H6 Handy Recorder",
+  "AudioChannelCount": 2,
+  "AudioSampleRate": 44100,
+  "Duration": 300.2
+}
+```
+
+### Example image sidecar JSON
+
+For a still image:
+
+```JSON
+{
+  "TaskName": "Reaching",
+  "Device": "GoPro Hero 10",
+  "Height": 1080,
+  "Width": 1920,
+  "CameraPosition": "overhead"
+}
+```
+
+### Annotations and events
+
+Behavioral annotations or event markers for audio and video recordings
+SHOULD be stored in accompanying `_events.tsv` files following the standard
+[events file format](../modality-agnostic-files/events.md).
+These events files use the same filename entities as the audio/video file they describe,
+but with the `_events` suffix.
+
+For example:
+
+<!-- This block generates a file tree.
+A guide for using macros can be found at
+ https://github.com/bids-standard/bids-specification/blob/master/macros_doc.md
+-->
+{{ MACROS___make_filetree_example(
+   {
+   "sub-01": {
+      "beh": {
+         "sub-01_task-speech_audio.wav": "",
+         "sub-01_task-speech_audio.json": "",
+         "sub-01_task-speech_events.tsv": "",
+         "sub-01_task-speech_events.json": "",
+         },
+      },
+   }
+) }}
+
 ## Example `_beh.tsv`
 
 ```tsv
diff --git a/src/schema/objects/extensions.yaml b/src/schema/objects/extensions.yaml
index 3c7ef248fa..3032d4f92b 100644
--- a/src/schema/objects/extensions.yaml
+++ b/src/schema/objects/extensions.yaml
@@ -1,5 +1,11 @@
 ---
 # This file describes valid file extensions in the specification.
+aac:
+  value: .aac
+  display_name: Advanced Audio Coding
+  description: |
+    An [Advanced Audio Coding](https://en.wikipedia.org/wiki/Advanced_Audio_Coding)
+    audio file.
 ave:
   value: .ave
   display_name: AVE # not sure what ave stands for
@@ -7,6 +13,12 @@ ave:
     File containing data averaged by segments of interest.
 
     Used by KIT, Yokogawa, and Ricoh MEG systems.
+avi:
+  value: .avi
+  display_name: Audio Video Interleave
+  description: |
+    An [Audio Video Interleave](https://en.wikipedia.org/wiki/Audio_Video_Interleave)
+    media container file.
 bdf:
   value: .bdf
   display_name: Biosemi Data Format
@@ -114,6 +126,12 @@ fif:
   display_name: Functional Imaging File Format
   description: |
     An MEG file format used by Neuromag, Elekta, and MEGIN.
+flac:
+  value: .flac
+  display_name: Free Lossless Audio Codec
+  description: |
+    A [FLAC](https://en.wikipedia.org/wiki/FLAC) audio file.
+    This format is commonly used for behavioral audio recordings.
 jpg:
   value: .jpg
   display_name: Joint Photographic Experts Group Format
@@ -153,6 +171,22 @@ md:
   display_name: Markdown
   description: |
     A Markdown file.
+mkv:
+  value: .mkv
+  display_name: Matroska Video
+  description: |
+    A [Matroska](https://www.matroska.org/) media container file.
+mp3:
+  value: .mp3
+  display_name: MP3 Audio
+  description: |
+    An [MP3](https://en.wikipedia.org/wiki/MP3) audio file.
+mp4:
+  value: .mp4
+  display_name: MPEG-4 Part 14
+  description: |
+    An [MPEG-4 Part 14](https://en.wikipedia.org/wiki/MP4_file_format)
+    media container file.
 mefd:
   value: .mefd/
   display_name: Multiscale Electrophysiology File Format Version 3.0
@@ -201,6 +235,12 @@ nwb:
     A [Neurodata Without Borders](https://nwb-schema.readthedocs.io/en/latest/) file.
 
     Each recording consists of a single `.nwb` file.
+ogg:
+  value: .ogg
+  display_name: Ogg Vorbis
+  description: |
+    An [Ogg](https://en.wikipedia.org/wiki/Ogg) audio file,
+    typically containing Vorbis-encoded audio.
 OMEBigTiff:
   value: .ome.btf
   display_name: Open Microscopy Environment BigTIFF
@@ -249,6 +289,11 @@ snirf:
   display_name: Shared Near Infrared Spectroscopy Format
   description: |
     HDF5 file organized according to the [SNIRF specification](https://github.com/fNIRS/snirf)
+svg:
+  value: .svg
+  display_name: Scalable Vector Graphics
+  description: |
+    A [Scalable Vector Graphics](https://en.wikipedia.org/wiki/SVG) image file.
 sqd:
   value: .sqd
   display_name: SQD
@@ -263,6 +308,12 @@ tif:
   display_name: Tag Image File Format
   description: |
     A [Tag Image File Format](https://en.wikipedia.org/wiki/TIFF) file.
+tiff:
+  value: .tiff
+  display_name: Tag Image File Format
+  description: |
+    A [Tag Image File Format](https://en.wikipedia.org/wiki/TIFF) image file.
+    The `.tiff` extension is the long form of `.tif`.
 trg:
   value: .trg
   display_name: KRISS TRG
@@ -307,6 +358,23 @@ vmrk:
     A text marker file in the
     [BrainVision Core Data Format](https://www.brainproducts.com/support-resources/brainvision-core-data-format-1-0/).
     These files come in three-file sets, including a `.vhdr`, a `.vmrk`, and a `.eeg` file.
+wav:
+  value: .wav
+  display_name: Waveform Audio
+  description: |
+    A [Waveform Audio File Format](https://en.wikipedia.org/wiki/WAV)
+    audio file, typically containing uncompressed PCM audio.
+webm:
+  value: .webm
+  display_name: WebM
+  description: |
+    A [WebM](https://www.webmproject.org/) media container file,
+    typically containing VP8/VP9 video and Vorbis/Opus audio.
+webp:
+  value: .webp
+  display_name: WebP Image
+  description: |
+    A [WebP](https://en.wikipedia.org/wiki/WebP) image file.
 Any:
   value: .*
   display_name: Any Extension
diff --git a/src/schema/objects/metadata.yaml b/src/schema/objects/metadata.yaml
index 347f6a53c0..747e081e9a 100644
--- a/src/schema/objects/metadata.yaml
+++ b/src/schema/objects/metadata.yaml
@@ -237,6 +237,42 @@ AttenuationCorrectionMethodReference:
   description: |
     Reference paper for the attenuation correction method used.
   type: string
+AudioChannelCount:
+  name: AudioChannelCount
+  display_name: Audio Channel Count
+  description: |
+    Number of audio channels in the audio or audio-video file
+    (for example, `1` for mono, `2` for stereo).
+  type: integer
+  minimum: 1
+AudioCodec:
+  name: AudioCodec
+  display_name: Audio Codec
+  description: |
+    The audio codec used to encode the audio stream, expressed as an
+    [FFmpeg codec name](https://www.ffmpeg.org/ffmpeg-codecs.html)
+    (for example, `"aac"`, `"mp3"`, `"opus"`, `"flac"`, `"pcm_s16le"`).
+    This value can be auto-extracted using
+    `ffprobe -v quiet -print_format json -show_streams`.
+  type: string
+AudioCodecRFC6381:
+  name: AudioCodecRFC6381
+  display_name: Audio Codec (RFC 6381)
+  description: |
+    The audio codec expressed as an
+    [RFC 6381](https://datatracker.ietf.org/doc/html/rfc6381) codec string
+    (for example, `"mp4a.40.2"` for AAC-LC).
+    This representation is useful for web and broadcast interoperability.
+  type: string
+AudioSampleRate:
+  name: AudioSampleRate
+  display_name: Audio Sample Rate
+  description: |
+    Sampling frequency of the audio stream, in Hz
+    (for example, `44100`, `48000`, `96000`).
+  type: number
+  exclusiveMinimum: 0
+  unit: Hz
 Authors:
   name: Authors
   display_name: Authors
@@ -923,6 +959,13 @@ Descriptors:
     - type: array
       items:
         type: string
+Device:
+  name: Device
+  display_name: Device
+  description: |
+    Free-form description of the device used to record the data
+    (for example, `"iPhone 12"`, `"Canon EOS R5"`).
+  type: string
 DeviceSerialNumber:
   name: DeviceSerialNumber
   display_name: Device Serial Number
@@ -1871,6 +1914,22 @@ ImageAcquisitionProtocol:
     [URI](SPEC_ROOT/common-principles.md#uniform-resource-indicator)
     (for example from [protocols.io](https://www.protocols.io/)).
   type: string
+ImageBitDepth:
+  name: ImageBitDepth
+  display_name: Image Bit Depth
+  description: |
+    Bit depth per channel of the stored pixel data of the video frame or
+    image (for example, `8`, `10`, `12`, `16`). For multi-channel data
+    this is the depth of each individual channel.
+    When `ImagePixelFormat` is also provided, this field is redundant with
+    the bit depth encoded in the FFmpeg `pix_fmt` value (for example,
+    `yuv420p10le` -> 10) and the two MUST agree. `ImageBitDepth` is
+    nonetheless useful as a more directly discoverable summary, and as the
+    primary precision field for image-only sidecars whose producing tools
+    do not naturally surface `pix_fmt`.
+  type: integer
+  minimum: 1
+  unit: bit
 ImageDecayCorrected:
   name: ImageDecayCorrected
   display_name: Image Decay Corrected
@@ -1885,6 +1944,38 @@ ImageDecayCorrectionTime:
     `"TimeZero"` in the default unit seconds.
   type: number
   unit: s
+ImageHeight:
+  name: ImageHeight
+  display_name: Image Height
+  description: |
+    Height of the video frame or image, in pixels.
+    Corresponds to the number of rows in the stored pixel grid as captured,
+    without applying any orientation correction that may be reported by
+    container metadata (for example, the EXIF `Orientation` tag).
+  type: integer
+  minimum: 1
+  unit: px
+ImagePixelFormat:
+  name: ImagePixelFormat
+  display_name: Image Pixel Format
+  description: |
+    The pixel format of the video frame or image, as reported by FFmpeg's
+    `pix_fmt` field (for example, `"yuv420p"`, `"yuv420p10le"`, `"gray16le"`,
+    `"rgb24"`). A single `pix_fmt` value encodes the color model, channel
+    count, chroma subsampling, and bit depth, and can be extracted
+    automatically with `ffprobe`.
+  type: string
+ImageWidth:
+  name: ImageWidth
+  display_name: Image Width
+  description: |
+    Width of the video frame or image, in pixels.
+    Corresponds to the number of columns in the stored pixel grid as captured,
+    without applying any orientation correction that may be reported by
+    container metadata (for example, the EXIF `Orientation` tag).
+  type: integer
+  minimum: 1
+  unit: px
 Immersion:
   name: Immersion
   display_name: Immersion
@@ -4499,6 +4590,46 @@ VisionCorrection:
     Equipment used to correct participant vision during an experiment.
     Example: "spectacles", "lenses", "none".
   type: string
+VideoCodec:
+  name: VideoCodec
+  display_name: Video Codec
+  description: |
+    The video codec used to encode the video stream, expressed as an
+    [FFmpeg codec name](https://www.ffmpeg.org/ffmpeg-codecs.html)
+    (for example, `"h264"`, `"hevc"`, `"vp9"`, `"av1"`).
+    This value can be auto-extracted using
+    `ffprobe -v quiet -print_format json -show_streams`.
+  type: string
+VideoCodecRFC6381:
+  name: VideoCodecRFC6381
+  display_name: Video Codec (RFC 6381)
+  description: |
+    The video codec expressed as an
+    [RFC 6381](https://datatracker.ietf.org/doc/html/rfc6381) codec string
+    (for example, `"avc1.640028"` for H.264 High Profile Level 4.0).
+    This representation is useful for web and broadcast interoperability.
+  type: string
+VideoFrameCount:
+  name: VideoFrameCount
+  display_name: Video Frame Count
+  description: |
+    Total number of frames in the video stream.
+    For constant frame rate video this can be derived from `VideoFrameRate`
+    and `RecordingDuration`, but for variable frame rate (VFR) video the
+    derivation is undefined, so an explicit value is needed.
+    Also useful as an integrity check to detect truncated or corrupted files.
+  type: integer
+  minimum: 1
+VideoFrameRate:
+  name: VideoFrameRate
+  display_name: Video Frame Rate
+  description: |
+    The video frame rate of the video stream, in Hz
+    (for example, `24`, `25`, `29.97`, `30`, `60`).
+    For variable frame rate (VFR) video, this value SHOULD be the nominal frame rate.
+  type: number
+  exclusiveMinimum: 0
+  unit: Hz
 VolumeTiming:
   name: VolumeTiming
   display_name: Volume Timing
@@ -4638,3 +4769,65 @@ iEEGReference:
     this field should have a general description and the channel specific
     reference should be defined in the `channels.tsv` file.
   type: string
+
+AudioDuration:
+  name: AudioDuration
+  display_name: Audio Duration
+  description: |
+    Duration of the audio recording in seconds.
+  type: number
+  exclusiveMinimum: 0
+  unit: s
+
+AudioBitDepth:
+  name: AudioBitDepth
+  display_name: Audio Bit Depth
+  description: |
+    Number of bits per sample in the audio recording.
+
+    Common values include `16`, `24`, or `32`.
+  type: integer
+  minimum: 1
+
+CameraPosition:
+  name: CameraPosition
+  display_name: Camera Position
+  description: |
+    Free-form description of the camera placement relative to the subject or scene.
+
+    Examples include "front", "profile-left", "ceiling", "room-corner", or "overhead".
+  type: string
+
+Duration:
+  name: Duration
+  display_name: Duration
+  description: |
+    Total duration of the audio or video recording in seconds.
+  type: number
+  exclusiveMinimum: 0
+  unit: s
+
+FrameRate:
+  name: FrameRate
+  display_name: Frame Rate
+  description: |
+    Frame rate of the video recording in frames per second (for example, `30.0`).
+  type: number
+  exclusiveMinimum: 0
+  unit: Hz
+
+Height:
+  name: Height
+  display_name: Video Height
+  description: |
+    Height of the video frame or image in pixels (for example, `1080`).
+  type: integer
+  minimum: 1
+
+Width:
+  name: Width
+  display_name: Video Width
+  description: |
+    Width of the video frame or image in pixels (for example, `1920`).
+  type: integer
+  minimum: 1
diff --git a/src/schema/objects/suffixes.yaml b/src/schema/objects/suffixes.yaml
index 37cca59edf..6788bfcd92 100644
--- a/src/schema/objects/suffixes.yaml
+++ b/src/schema/objects/suffixes.yaml
@@ -516,6 +516,18 @@ asl:
     The complete ASL time series stored as a 4D NIfTI file in the original
     acquisition order, with possible volume types including: control, label,
     m0scan, deltam, cbf.
+audio:
+  value: audio
+  display_name: Audio file
+  description: |
+    An audio data file containing one or more audio streams.
+    Common formats include WAV (uncompressed), MP3, AAC, and Ogg Vorbis.
+audiovideo:
+  value: audiovideo
+  display_name: Audio-video file
+  description: |
+    A media file containing both audio and video streams.
+    Common containers include MP4, MKV, AVI, and WebM.
 aslcontext:
   value: aslcontext
   display_name: Arterial Spin Labeling Context
@@ -666,6 +678,12 @@ ieeg:
   display_name: Intracranial Electroencephalography
   description: |
     Intracranial electroencephalography recording data.
+image:
+  value: image
+  display_name: Image file
+  description: |
+    A still image data file.
+    Common formats include JPEG, PNG, SVG, WebP, and TIFF.
 inplaneT1:
   value: inplaneT1
   display_name: Inplane T1
@@ -897,3 +915,9 @@ unloc:
   description: |
     MRS acquisitions run without localization.
     This includes signals detected using coil sensitivity only.
+video:
+  value: video
+  display_name: Video file
+  description: |
+    A video data file containing one or more video streams but no audio.
+    Common containers include MP4, MKV, AVI, and WebM.
diff --git a/src/schema/rules/files/raw/beh.yaml b/src/schema/rules/files/raw/beh.yaml
index df6f9dac06..4d96c62b41 100644
--- a/src/schema/rules/files/raw/beh.yaml
+++ b/src/schema/rules/files/raw/beh.yaml
@@ -9,3 +9,88 @@ noncontinuous:
     - .json
   datatypes:
     - beh
+  entities:
+    subject: required
+    session: optional
+    task: required
+    acquisition: optional
+    run: optional
+
+# Audio recordings
+audio:
+  suffixes:
+    - audio
+  extensions:
+    - .flac
+    - .mp3
+    - .ogg
+    - .wav
+    - .json
+  datatypes:
+    - beh
+  entities:
+    subject: required
+    session: optional
+    task: optional
+    acquisition: optional
+    run: optional
+    recording: optional
+    split: optional
+
+# Video recordings
+video:
+  suffixes:
+    - video
+  extensions:
+    - .mp4
+    - .mkv
+    - .avi
+    - .json
+  datatypes:
+    - beh
+  entities:
+    subject: required
+    session: optional
+    task: optional
+    acquisition: optional
+    run: optional
+    recording: optional
+    split: optional
+
+# Combined audio-video recordings
+audiovideo:
+  suffixes:
+    - audiovideo
+  extensions:
+    - .mp4
+    - .mkv
+    - .avi
+    - .json
+  datatypes:
+    - beh
+  entities:
+    subject: required
+    session: optional
+    task: optional
+    acquisition: optional
+    run: optional
+    recording: optional
+    split: optional
+
+# Still images
+image:
+  suffixes:
+    - image
+  extensions:
+    - .jpg
+    - .png
+    - .json
+  datatypes:
+    - beh
+  entities:
+    subject: required
+    session: optional
+    task: optional
+    acquisition: optional
+    run: optional
+    recording: optional
diff --git a/src/schema/rules/sidecars/beh.yaml b/src/schema/rules/sidecars/beh.yaml
index f2d8410914..545e7646f5 100644
--- a/src/schema/rules/sidecars/beh.yaml
+++ b/src/schema/rules/sidecars/beh.yaml
@@ -25,3 +25,48 @@ BEHInstitutionInformation:
     InstitutionName: recommended
     InstitutionAddress: recommended
     InstitutionalDepartmentName: recommended
+
+# Audio, Video, and Image metadata
+AudioVideoImageDevice:  # device and license fields for all four media suffixes in beh
+  selectors:
+    - datatype == "beh"
+    - intersects([suffix], ["audio", "video", "audiovideo", "image"])  # still images use suffix `image`
+  fields:
+    Device: optional
+    DeviceSerialNumber: optional
+    License: optional
+
+AudioVideoDuration:  # beh-only rule for the audio, video and audiovideo suffixes
+  selectors:
+    - datatype == "beh"
+    - intersects([suffix], ["audio", "video", "audiovideo"])
+  fields:
+    Duration: optional  # NOTE(review): media.yaml adds RecordingDuration for these suffixes; confirm overlap
+
+AudioStreams:  # beh-only rule for the audio and audiovideo suffixes
+  selectors:
+    - datatype == "beh"
+    - intersects([suffix], ["audio", "audiovideo"])
+  fields:  # NOTE(review): media.yaml marks the first two "recommended" for these suffixes; confirm
+    AudioChannelCount: optional
+    AudioSampleRate: optional
+    AudioBitDepth: optional
+
+VideoStreams:  # beh-only rule for the video and audiovideo suffixes
+  selectors:
+    - datatype == "beh"
+    - intersects([suffix], ["video", "audiovideo"])
+  fields:  # NOTE(review): media.yaml uses VideoFrameRate/ImageHeight/ImageWidth here; confirm overlap
+    FrameRate: optional
+    Height: optional
+    Width: optional
+    CameraPosition: optional
+
+ImageProperties:  # size and camera fields for still images in beh
+  selectors:
+    - datatype == "beh"
+    - suffix == "image"  # must equal the suffix declared by the `image` file rule
+  fields:
+    Height: optional
+    Width: optional
+    CameraPosition: optional
diff --git a/src/schema/rules/sidecars/media.yaml b/src/schema/rules/sidecars/media.yaml
new file mode 100644
index 0000000000..9e5dc25ed4
--- /dev/null
+++ b/src/schema/rules/sidecars/media.yaml
@@ -0,0 +1,37 @@
+#
+# Groups of related metadata fields for media files
+#
+
+---
+MediaDuration:  # no datatype selector: matches these suffixes wherever they occur
+  selectors:
+    - intersects([suffix], ["audio", "video", "audiovideo"])
+  fields:
+    RecordingDuration: recommended
+
+MediaAudioProperties:  # selected for the audio and audiovideo suffixes, in any datatype
+  selectors:
+    - intersects([suffix], ["audio", "audiovideo"])
+  fields:
+    AudioCodec: recommended
+    AudioSampleRate: recommended
+    AudioChannelCount: recommended
+    AudioCodecRFC6381: optional  # the only non-recommended field in this group
+
+MediaImageProperties:  # also selected for the video suffixes, not only `image`
+  selectors:
+    - intersects([suffix], ["video", "audiovideo", "image"])
+  fields:
+    ImageWidth: recommended
+    ImageHeight: recommended
+    ImagePixelFormat: optional
+    ImageBitDepth: optional
+
+MediaVideoProperties:  # selected for the video and audiovideo suffixes, in any datatype
+  selectors:
+    - intersects([suffix], ["video", "audiovideo"])
+  fields:
+    VideoCodec: recommended
+    VideoFrameRate: recommended
+    VideoFrameCount: recommended
+    VideoCodecRFC6381: optional  # the only non-recommended field in this group
diff --git a/tools/mkdocs_macros_bids/macros.py b/tools/mkdocs_macros_bids/macros.py
index 2e7c2f893e..7738dcf295 100644
--- a/tools/mkdocs_macros_bids/macros.py
+++ b/tools/mkdocs_macros_bids/macros.py
@@ -203,6 +203,52 @@ def make_suffix_table(suffixes, src_path=None):
     return table
 
 
+def make_extension_table(extensions, src_path=None):
+    """Render a Markdown table describing the requested file extensions.
+
+    Parameters
+    ----------
+    extensions : list of str
+        Keys into the schema's objects.extensions mapping, one per table row
+        (for example, ``["wav", "mp3", "aac", "ogg"]``).
+        Rows are emitted in the order the keys are given.
+    src_path : str or None
+        Path of the page invoking the macro, as supplied by the
+        "page.file.src_path" variable; looked up automatically when None.
+
+    Returns
+    -------
+    table : str
+        A three-column Markdown table (format, extension, description).
+    """
+    if src_path is None:
+        src_path = _get_source_path()
+
+    known_extensions = schema.load_schema()["objects"]["extensions"]
+
+    # Glossary location expressed relative to the calling page's directory
+    glossary_link = os.path.relpath("glossary.md", os.path.dirname(src_path))
+
+    def _render_row(key):
+        entry = known_extensions[key]
+        value = entry["value"]
+        display_name = entry["display_name"]
+        # Fold every run of whitespace (newlines included) into one space
+        summary = " ".join(entry["description"].strip().split())
+
+        # The extension value links to its glossary anchor
+        anchor = f"[{value}]({glossary_link}#objects.extensions.{key})"
+        return f"| {display_name} | {anchor} | {summary} |"
+
+    # Header and separator rows come first, then one row per requested key
+    table_lines = [
+        "| **Format** | **Extension** | **Description** |",
+        "| --- | --- | --- |",
+    ]
+    table_lines.extend(_render_row(key) for key in extensions)
+    return "\n".join(table_lines)
+
+
 def make_metadata_table(field_info, src_path=None):
     """Generate a markdown table of metadata field information.
 
diff --git a/tools/mkdocs_macros_bids/main.py b/tools/mkdocs_macros_bids/main.py
index 7fa873247a..e4cbd2ba70 100644
--- a/tools/mkdocs_macros_bids/main.py
+++ b/tools/mkdocs_macros_bids/main.py
@@ -38,6 +38,7 @@ def define_env(env):
     )
     env.macro(macros.make_glossary, "MACROS___make_glossary")
     env.macro(macros.make_suffix_table, "MACROS___make_suffix_table")
+    env.macro(macros.make_extension_table, "MACROS___make_extension_table")
     env.macro(macros.make_metadata_table, "MACROS___make_metadata_table")
     env.macro(macros.make_json_table, "MACROS___make_json_table")
     env.macro(macros.make_sidecar_table, "MACROS___make_sidecar_table")
diff --git a/tools/schemacode/src/bidsschematools/tests/test_render_tables.py b/tools/schemacode/src/bidsschematools/tests/test_render_tables.py
index 22676689d0..7cd77ad951 100644
--- a/tools/schemacode/src/bidsschematools/tests/test_render_tables.py
+++ b/tools/schemacode/src/bidsschematools/tests/test_render_tables.py
@@ -1,8 +1,16 @@
 """Tests for the bidsschematools package."""
 
+import sys
+from pathlib import Path
+
 from bidsschematools.render import tables
 from bidsschematools.render.utils import normalize_requirements
 
+# Make mkdocs_macros_bids importable; parents[5] is the repo root given this file's path
+_macros_dir = Path(__file__).parents[5] / "tools" / "mkdocs_macros_bids"
+if str(_macros_dir) not in sys.path:
+    sys.path.insert(0, str(_macros_dir))
+
 
 def test_make_entity_table(schema_obj):
     """
@@ -145,3 +153,39 @@ def test_make_columns_table(schema_obj):
         assert level.upper() in render_row
         assert level_addendum.split("\n")[0] in render_row
         assert description_addendum.split("\n")[0] in render_row
+
+
+def test_make_extension_table(schema_obj):
+    """Check the Markdown emitted by the make_extension_table macro.
+
+    The macro under test is imported from mkdocs_macros_bids.
+    """
+    import macros as mkdocs_macros  # type: ignore[import-not-found]
+
+    # Extension key -> (extension value, display name), in request order
+    expected = {
+        "wav": (".wav", "Waveform Audio"),
+        "mp4": (".mp4", "MPEG-4 Part 14"),
+        "jpg": (".jpg", "Joint Photographic Experts Group"),
+    }
+    requested = list(expected)
+
+    rendered = mkdocs_macros.make_extension_table(
+        requested,
+        src_path="appendices/media-files.md",
+    )
+    lines = rendered.split("\n")
+
+    # The table opens with its header row and the separator row
+    assert lines[0].startswith("| **Format**")
+    assert lines[1].startswith("| ---")
+
+    # Exactly one data row follows for every requested extension
+    data_rows = lines[2:]
+    assert len(lines) == len(requested) + 2
+
+    for key, row in zip(requested, data_rows):
+        value, display_name = expected[key]
+        assert display_name in row
+        assert value in row
+        assert f"glossary.md#objects.extensions.{key}" in row