diff --git a/9_Firmware/9_2_FPGA/tb/cosim/compare_independent.py b/9_Firmware/9_2_FPGA/tb/cosim/compare_independent.py
new file mode 100644
index 0000000..89c9bf0
--- /dev/null
+++ b/9_Firmware/9_2_FPGA/tb/cosim/compare_independent.py
@@ -0,0 +1,416 @@
+#!/usr/bin/env python3
+"""
+T-6 drift cosim — twin (fpga_model.py, RTL-mirroring) vs reference
+(fpga_reference.py, numpy-based).
+
+Designed to catch the bug class where a transcription error exists
+identically in BOTH the bit-exact Python twin and the RTL (e.g. a
+hand-copied LUT entry, a regenerated .mem with wrong arithmetic, a
+window coefficient typo). The existing cosim suite cannot detect that
+class because both sides of the comparison are computing the same
+algorithm; this script gives an independent third leg.
+
+Strategy (from highest-value to lowest):
+
+  1. Bytewise LUT spot-checks — every entry of every LUT/ROM compared
+     to its analytical Q15 value. A single corrupted entry fails here.
+       * NCO_SINE_LUT (64 entries, sin(pi*k/128) Q15)
+       * Twiddle ROMs (.mem files) for N=16, N=2048
+       * HAMMING_WINDOW (16 entries, sym Hamming N=16 Q15)
+
+  2. End-to-end peak-position invariants — feed canonical inputs
+     through both twin and reference; the peak bin/index must match.
+
+  3. Roundtrip / structural invariants — FFT->IFFT recovers input,
+     NCO output stays on the unit circle, etc.
+
+We intentionally avoid Q15-saturated magnitude comparisons; those are
+where the twin's bit-exact Q15 saturation differs from numpy's float
+output, and they don't represent a real drift.
+"""
+
+from __future__ import annotations
+
+import math
+import sys
+from pathlib import Path
+
+import numpy as np
+
+# Make local imports work when invoked from anywhere
+THIS_DIR = Path(__file__).resolve().parent
+sys.path.insert(0, str(THIS_DIR))
+
+import fpga_reference as ref  # noqa: E402
+from fpga_model import (  # noqa: E402
+    HAMMING_WINDOW,
+    NCO,
+    DopplerProcessor,
+    FFTEngine,
+    MatchedFilterChain,
+    NCO_SINE_LUT,
+    load_twiddle_rom,
+)
+
+
+# =============================================================================
+# Tolerances
+# =============================================================================
+
+TOL_NCO_LUT_LSB        = 1     # NCO_SINE_LUT: tightest possible
+TOL_TWIDDLE_LSB        = 1     # twiddle ROMs: same — quarter-wave Q15 cosine
+TOL_HAMMING_LSB        = 4     # 4 LSB ≈ 1.2e-4 rounding budget on Q15 round
+TOL_NCO_MAG_REL        = 0.04  # quarter-wave LUT artifact at quadrant edges
+TOL_FFT_ROUNDTRIP_LSB  = 60    # 11 stages × Q15 noise on 2048-pt; empirical
+
+
+# =============================================================================
+# Result tracker
+# =============================================================================
+
+class CheckResult:
+    def __init__(self):
+        self.pass_count = 0
+        self.fail_count = 0
+
+    def check(self, ok: bool, label: str, detail: str = ""):
+        tag = "PASS" if ok else "FAIL"
+        msg = f"  [{tag}] {label}"
+        if detail:
+            msg += f" — {detail}"
+        print(msg)
+        if ok:
+            self.pass_count += 1
+        else:
+            self.fail_count += 1
+
+    def info(self, label: str, detail: str = ""):
+        print(f"  [INFO] {label}" + (f" — {detail}" if detail else ""))
+
+    def summary(self):
+        total = self.pass_count + self.fail_count
+        print(f"\n  RESULT: {self.pass_count}/{total} drift checks passed")
+        return self.fail_count == 0
+
+
+# =============================================================================
+# Section 1 — bytewise LUT spot-checks
+# =============================================================================
+
+def check_nco_lut(result: CheckResult):
+    print("\n--- NCO_SINE_LUT bytewise check ---")
+    max_dev = 0
+    bad = []
+    for k in range(64):
+        ideal = round(32767.0 * math.sin(math.pi * k / 128.0))
+        dev = abs(NCO_SINE_LUT[k] - ideal)
+        if dev > max_dev:
+            max_dev = dev
+        if dev > TOL_NCO_LUT_LSB:
+            bad.append((k, NCO_SINE_LUT[k], ideal, dev))
+    result.check(
+        max_dev <= TOL_NCO_LUT_LSB,
+        f"NCO_SINE_LUT: all 64 entries match sin(pi*k/128) Q15 (tol {TOL_NCO_LUT_LSB} LSB)",
+        f"max |LUT - ideal| = {max_dev} LSB" + (
+            f"; {len(bad)} entries off, e.g. k={bad[0][0]}: LUT={bad[0][1]}, "
+            f"ideal={bad[0][2]}" if bad else ""
+        )
+    )
+
+
+def check_twiddle_rom(result: CheckResult, n: int, mem_filename: str):
+    print(f"\n--- Twiddle ROM ({mem_filename}, N={n}) bytewise check ---")
+    rom_path = Path(__file__).resolve().parents[2] / mem_filename
+    if not rom_path.exists():
+        result.check(False, f"Twiddle ROM file present at {rom_path}", "missing")
+        return
+    cos_rom = load_twiddle_rom(str(rom_path), n=n)
+    expected_entries = n // 4
+    if len(cos_rom) != expected_entries:
+        result.check(
+            False,
+            f"Twiddle ROM length = N/4 = {expected_entries}",
+            f"got {len(cos_rom)}"
+        )
+        return
+
+    max_dev = 0
+    bad = []
+    for k in range(expected_entries):
+        ideal = round(32767.0 * math.cos(2.0 * math.pi * k / n))
+        # Q15 representation tops at 32767 — ideal cos(0)=32768 rounds down.
+        if ideal == 32768:
+            ideal = 32767
+        dev = abs(cos_rom[k] - ideal)
+        if dev > max_dev:
+            max_dev = dev
+        if dev > TOL_TWIDDLE_LSB:
+            bad.append((k, cos_rom[k], ideal, dev))
+    result.check(
+        max_dev <= TOL_TWIDDLE_LSB,
+        f"{mem_filename}: all {expected_entries} entries match cos(2pi*k/{n}) Q15 (tol {TOL_TWIDDLE_LSB} LSB)",
+        f"max |ROM - ideal| = {max_dev} LSB" + (
+            f"; {len(bad)} bad, e.g. k={bad[0][0]}: ROM={bad[0][1]}, ideal={bad[0][2]}"
+            if bad else ""
+        )
+    )
+
+
+def check_hamming_lut(result: CheckResult):
+    print("\n--- HAMMING_WINDOW bytewise check ---")
+    win_lut = np.array(HAMMING_WINDOW, dtype=np.int64)
+    win_ref = np.round(ref.hamming_16_ideal()).astype(np.int64)
+    diff = np.abs(win_lut - win_ref)
+    max_dev = int(diff.max())
+    worst_idx = int(np.argmax(diff))
+    bad = [(int(i), int(win_lut[i]), int(win_ref[i]), int(diff[i]))
+           for i in range(16) if diff[i] > TOL_HAMMING_LSB]
+    result.check(
+        max_dev <= TOL_HAMMING_LSB,
+        f"HAMMING_WINDOW: all 16 entries match 0.54-0.46*cos(2pi*n/15) Q15 (tol {TOL_HAMMING_LSB} LSB)",
+        f"max |LUT - ideal| = {max_dev} LSB at n={worst_idx} "
+        f"(LUT={int(win_lut[worst_idx])}, ideal={int(win_ref[worst_idx])})"
+    )
+    if bad:
+        result.info(
+            "HAMMING_WINDOW drift detected — both fpga_model.py and "
+            "doppler_processor.v contain values that diverge from the "
+            "documented formula. Resolution: either update the comments to "
+            "reflect the actual values, or regen the LUTs to match the "
+            "formula and re-bless the affected goldens."
+        )
+
+
+# =============================================================================
+# Section 2 — end-to-end peak / structural invariants
+# =============================================================================
+
+def check_nco_invariants(result: CheckResult):
+    """NCO output should stay on unit circle and have the right frequency."""
+    print("\n--- NCO output invariants (unit-circle, frequency) ---")
+    ftw = 0x4CCCCCCD       # 120 MHz at 400 MSPS = 0.30 cycles/sample
+    n_capture = 1024
+
+    nco = NCO()
+    cos_stream: list[int] = []
+    sin_stream: list[int] = []
+    for _ in range(n_capture + 64):
+        s, c, ready = nco.step(ftw)
+        if ready:
+            cos_stream.append(c)
+            sin_stream.append(s)
+        if len(cos_stream) >= n_capture:
+            break
+    cos_arr = np.array(cos_stream, dtype=np.float64)
+    sin_arr = np.array(sin_stream, dtype=np.float64)
+
+    # Unit-circle invariant: cos² + sin² ≈ Q15_max². The quarter-wave LUT
+    # uses sin(k*pi/128) for k=0..63 — at the right edge of each quadrant
+    # it falls back to LUT[0]=0 instead of cos(63*pi/128)≈sin(pi/128), so
+    # the magnitude can dip by up to ~3 % of full scale. Tolerance 4 % covers
+    # this expected architectural artifact.
+    mag_ratio = (cos_arr ** 2 + sin_arr ** 2) / (32767.0 ** 2)
+    max_mag_dev = float(np.max(np.abs(mag_ratio - 1.0)))
+    result.check(
+        max_mag_dev <= TOL_NCO_MAG_REL,
+        "NCO output on unit circle (cos²+sin² ≈ 1)",
+        f"max |mag²-1| = {max_mag_dev * 100:.2f} % (tol {TOL_NCO_MAG_REL * 100:.0f} %)"
+    )
+
+    # Frequency invariant: dominant FFT bin of cos+j*sin should equal
+    # round(ftw / 2^32 * N).
+    z = cos_arr + 1j * sin_arr
+    Z = np.fft.fft(z)
+    peak_bin = int(np.argmax(np.abs(Z)))
+    expected_bin = int(round(ftw / (1 << 32) * n_capture))
+    result.check(
+        abs(peak_bin - expected_bin) <= 1,
+        f"NCO dominant frequency at FTW = {ftw:08X} (expected bin {expected_bin})",
+        f"got bin {peak_bin}"
+    )
+
+
+def check_fft_invariants(result: CheckResult):
+    """FFT structural sanity: impulse, roundtrip, peak position."""
+    print("\n--- FFT-2048 invariants (peak position, roundtrip, impulse) ---")
+    n = 2048
+    fft = FFTEngine(n=n)
+
+    # Impulse → flat spectrum at amplitude (no saturation; amp < 32767/N is overkill).
+    in_re = [1000] + [0] * (n - 1)
+    in_im = [0] * n
+    twin_re, twin_im = fft.compute(in_re, in_im, inverse=False)
+    flat_max = max(max(twin_re) - 1000, 1000 - min(twin_re),
+                   max(twin_im), -min(twin_im))
+    result.check(
+        flat_max <= 5,
+        "FFT-2048(impulse): all bins ≈ amplitude (1000)",
+        f"max |bin - 1000| = {flat_max}"
+    )
+
+    # Single COMPLEX tone (cos + j*sin) → single peak at bin_k (no conjugate
+    # at N-bin_k as you'd get with a real cosine). Amp small enough that
+    # peak = amp*N stays below Q15 saturation (32767).
+    bin_k = 137
+    amp = 15
+    in_re = [int(round(amp * math.cos(2 * math.pi * bin_k * i / n))) for i in range(n)]
+    in_im = [int(round(amp * math.sin(2 * math.pi * bin_k * i / n))) for i in range(n)]
+    twin_re, twin_im = fft.compute(in_re, in_im, inverse=False)
+    ref_re, ref_im = ref.fft_reference(in_re, in_im, n=n)
+    twin_mag2 = np.array(twin_re) ** 2 + np.array(twin_im) ** 2
+    ref_mag2 = np.asarray(ref_re) ** 2 + np.asarray(ref_im) ** 2
+    twin_peak = int(np.argmax(twin_mag2))
+    ref_peak = int(np.argmax(ref_mag2))
+    result.check(
+        twin_peak == ref_peak == bin_k,
+        f"FFT-2048(complex tone): peak at bin {bin_k}",
+        f"twin={twin_peak}, ref={ref_peak}"
+    )
+
+    # Roundtrip — small amplitude (peak = amp*N/2 ≤ 32767 → amp ≤ 32) so the
+    # forward FFT does not saturate, then IFFT should recover input within
+    # 11×Q15 butterfly noise.
+    rt_amp = 30
+    in_re = [int(rt_amp * math.sin(2 * math.pi * 73 * i / n)) for i in range(n)]
+    in_im = [0] * n
+    fwd_re, fwd_im = fft.compute(in_re, in_im, inverse=False)
+    rt_re, _ = fft.compute(fwd_re, fwd_im, inverse=True)
+    rt_max_err = max(abs(rt_re[i] - in_re[i]) for i in range(n))
+    result.check(
+        rt_max_err <= TOL_FFT_ROUNDTRIP_LSB,
+        f"FFT-2048(roundtrip, amp={rt_amp}): FFT->IFFT recovers input within {TOL_FFT_ROUNDTRIP_LSB} LSB",
+        f"max |rt - in| = {rt_max_err}"
+    )
+
+
+def check_mf_invariants(result: CheckResult):
+    """MF: peak must land at the injected delay; both twin and ref must agree."""
+    print("\n--- Matched filter invariants (peak position) ---")
+    n = 2048
+    mf = MatchedFilterChain(fft_size=n)
+
+    delay = 100
+    bin_k = 17
+    amp = 200
+    sig_re = [0] * n
+    sig_im = [0] * n
+    ref_re_in = [0] * n
+    ref_im_in = [0] * n
+    pulse_len = 256
+    for i in range(pulse_len):
+        ref_re_in[i] = int(round(amp * math.cos(2 * math.pi * bin_k * i / pulse_len)))
+        ref_im_in[i] = int(round(amp * math.sin(2 * math.pi * bin_k * i / pulse_len)))
+        sig_re[i + delay] = ref_re_in[i]
+        sig_im[i + delay] = ref_im_in[i]
+
+    twin_re, twin_im = mf.process(sig_re, sig_im, ref_re_in, ref_im_in)
+    ref_real, ref_imag = ref.matched_filter_reference(
+        sig_re, sig_im, ref_re_in, ref_im_in, fft_size=n
+    )
+    twin_mag = np.sqrt(np.array(twin_re) ** 2 + np.array(twin_im) ** 2)
+    ref_mag = np.sqrt(np.asarray(ref_real) ** 2 + np.asarray(ref_imag) ** 2)
+    twin_peak = int(np.argmax(twin_mag))
+    ref_peak = int(np.argmax(ref_mag))
+    result.check(
+        twin_peak == ref_peak == delay,
+        f"MF: peak at injected delay (bin {delay})",
+        f"twin={twin_peak}, ref={ref_peak}"
+    )
+
+    # Sidelobe behaviour: peak should be N×stronger than median.
+    twin_peak_val = float(twin_mag[delay])
+    twin_median = float(np.median(twin_mag))
+    pk_ratio = twin_peak_val / max(twin_median, 1.0)
+    result.check(
+        pk_ratio >= 5.0,
+        "MF peak-to-median ratio ≥ 5",
+        f"got ratio {pk_ratio:.2f}"
+    )
+
+
+def check_doppler_invariants(result: CheckResult):
+    """Doppler: peak per sub-frame must be at the injected Doppler bin."""
+    print("\n--- Doppler invariants (peak per sub-frame) ---")
+    chirps_per_frame = 48
+    range_bins = 16  # keep small; algorithm identical at any size
+    num_subframes = 3
+    chirps_per_subframe = 16
+
+    inject = [(0, 5), (1, 11), (2, 3)]
+    target_rbin = 10
+    amp = 1500  # avoid Q15 saturation: peak = amp * N/2 = 12000 < 32767
+    chirp_i = np.zeros((chirps_per_frame, range_bins), dtype=np.int64)
+    chirp_q = np.zeros((chirps_per_frame, range_bins), dtype=np.int64)
+    for sf, dop_bin in inject:
+        for c in range(chirps_per_subframe):
+            chirp_idx = sf * chirps_per_subframe + c
+            phase = 2 * math.pi * dop_bin * c / chirps_per_subframe
+            chirp_i[chirp_idx, target_rbin] = int(round(amp * math.cos(phase)))
+            chirp_q[chirp_idx, target_rbin] = int(round(amp * math.sin(phase)))
+
+    dop = DopplerProcessor(num_subframes=num_subframes,
+                           chirps_per_frame=chirps_per_frame)
+    dop.RANGE_BINS = range_bins
+    twin_dop_i, twin_dop_q = dop.process_frame(
+        chirp_i.tolist(), chirp_q.tolist()
+    )
+    ref_re, ref_im = ref.doppler_reference(
+        chirp_i, chirp_q,
+        num_subframes=num_subframes,
+        chirps_per_subframe=chirps_per_subframe,
+        range_bins=range_bins,
+    )
+
+    twin_mag = np.sqrt(np.array(twin_dop_i) ** 2 + np.array(twin_dop_q) ** 2)
+    ref_mag = np.sqrt(ref_re ** 2 + ref_im ** 2)
+    expected_bins = [sf * chirps_per_subframe + b for sf, b in inject]
+
+    sf_peaks_twin = []
+    sf_peaks_ref = []
+    for sf in range(num_subframes):
+        lo = sf * chirps_per_subframe
+        hi = lo + chirps_per_subframe
+        sf_peaks_twin.append(lo + int(np.argmax(twin_mag[target_rbin, lo:hi])))
+        sf_peaks_ref.append(lo + int(np.argmax(ref_mag[target_rbin, lo:hi])))
+
+    result.check(
+        sf_peaks_twin == expected_bins,
+        f"Doppler twin: peaks per sub-frame at {expected_bins}",
+        f"got {sf_peaks_twin}"
+    )
+    result.check(
+        sf_peaks_ref == expected_bins,
+        f"Doppler reference: peaks per sub-frame at {expected_bins}",
+        f"got {sf_peaks_ref}"
+    )
+
+
+# =============================================================================
+# Main
+# =============================================================================
+
+def main():
+    print("============================================================")
+    print("  T-6 INDEPENDENT REFERENCE DRIFT COSIM")
+    print("    fpga_model.py (twin) vs fpga_reference.py (numpy ideal)")
+    print("============================================================")
+    result = CheckResult()
+
+    # 1. LUT bytewise spot-checks (highest-value transcription detector)
+    check_nco_lut(result)
+    check_twiddle_rom(result, n=16,   mem_filename="fft_twiddle_16.mem")
+    check_twiddle_rom(result, n=2048, mem_filename="fft_twiddle_2048.mem")
+    check_hamming_lut(result)
+
+    # 2/3. End-to-end invariants
+    check_nco_invariants(result)
+    check_fft_invariants(result)
+    check_mf_invariants(result)
+    check_doppler_invariants(result)
+
+    ok = result.summary()
+    return 0 if ok else 1
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/9_Firmware/9_2_FPGA/tb/cosim/fpga_reference.py b/9_Firmware/9_2_FPGA/tb/cosim/fpga_reference.py
new file mode 100644
index 0000000..c913a75
--- /dev/null
+++ b/9_Firmware/9_2_FPGA/tb/cosim/fpga_reference.py
@@ -0,0 +1,245 @@
+#!/usr/bin/env python3
+r"""
+Independent floating-point reference for AERIS-10 signal processing chain.
+
+Unlike fpga_model.py (which is a bit-exact PORT of the RTL — same NCO LUT,
+same twiddle ROMs, same Q15 quantization), this module computes the
+algorithm using ideal numpy/scipy primitives. It is the second leg of the
+T-6 three-way triangulation:
+
+    fpga_model.py (twin)  --bit-exact--  RTL simulation
+              \                            /
+               \                          /
+                \-- within tolerance --> /
+                  fpga_reference.py
+                  (this file)
+
+Drift signal interpretation:
+  * twin == reference (within tol) and twin == RTL (bit-exact)
+        -> chain healthy
+  * twin == reference (within tol) but twin != RTL (bit-exact)
+        -> RTL diverged from spec (real RTL bug)
+  * twin != reference (outside tol) but twin == RTL (bit-exact)
+        -> twin and RTL share a transcription error
+           (e.g. wrong NCO_SINE_LUT entry, wrong twiddle ROM, wrong window
+           coefficients). This is the bug class T-6 was opened to catch.
+
+Coverage in this revision (highest transcription risk first):
+  * NCO  — numpy.cos/sin vs the 64-entry quarter-wave NCO_SINE_LUT
+  * FFT  — numpy.fft.fft/ifft vs Q15 twiddle ROM butterfly
+  * MF   — numpy ifft(fft(sig) * conj(fft(ref))) vs RTL pipeline
+  * Doppler — numpy.fft + ideal Hamming window vs Q15 window LUT + RTL FFT
+
+Out of scope here (lower transcription risk, deferred):
+  * CIC / FIR — coefficient files are derived from independent generators
+  * Mixer / RangeBinDecimator — trivially correct in both
+"""
+
+from __future__ import annotations
+
+import numpy as np
+
+
+# =============================================================================
+# NCO reference — ideal complex sinusoid
+# =============================================================================
+
+def nco_reference(num_samples: int, ftw: int, fs: float = 400e6,
+                  phase_offset_deg: float = 0.0):
+    """Ideal floating-point NCO output, scaled to match Q15 fpga_model.
+
+    The fpga_model.NCO uses a 32-bit phase accumulator stepped by ftw, with
+    a 64-entry quarter-wave cos LUT scaled to ±0x7FFF. This reference uses
+    numpy.cos/sin directly with no LUT.
+
+    Args:
+        num_samples: number of samples to produce
+        ftw: 32-bit unsigned phase increment per sample
+        fs: sample rate (Hz)
+        phase_offset_deg: phase offset (degrees), default 0
+
+    Returns:
+        (cos_q15, sin_q15) — float arrays of length num_samples, in Q15 scale
+    """
+    ftw = int(ftw) & 0xFFFFFFFF
+    n = np.arange(num_samples, dtype=np.float64)
+    # Phase accumulator semantics: phase[k] = (k * ftw) mod 2^32
+    # Convert to radians: 2*pi * phase / 2^32
+    phase_rad = 2.0 * np.pi * (n * ftw) / (1 << 32) + np.deg2rad(phase_offset_deg)
+    cos_q15 = np.cos(phase_rad) * 32767.0
+    sin_q15 = np.sin(phase_rad) * 32767.0
+    return cos_q15, sin_q15
+
+
+# =============================================================================
+# FFT reference — numpy.fft.fft / ifft
+# =============================================================================
+
+def fft_reference(in_re, in_im, n: int = 2048, inverse: bool = False):
+    """Ideal floating-point FFT.
+
+    Scaling matches the RTL convention:
+      forward: y[k] = sum_n x[n] * exp(-j*2*pi*k*n/N)            (no 1/N)
+      inverse: y[n] = (1/N) * sum_k X[k] * exp(+j*2*pi*k*n/N)    (1/N applied)
+
+    The RTL fft_engine implements >>>LOG2N before output saturation when
+    inverse=1, which is the same 1/N. numpy.fft.ifft already includes the
+    1/N factor, so we use it directly with no rescaling.
+
+    Args:
+        in_re/in_im: length-N int or float sequences
+        n: FFT size (16, 1024, 2048)
+        inverse: True for IFFT
+
+    Returns:
+        (out_re, out_im) — float arrays, no Q15 saturation
+    """
+    re = np.asarray(in_re, dtype=np.float64)
+    im = np.asarray(in_im, dtype=np.float64)
+    if len(re) != n or len(im) != n:
+        raise ValueError(f"input length {len(re)} != N={n}")
+    x = re + 1j * im
+    y = np.fft.ifft(x) if inverse else np.fft.fft(x)
+    return y.real.copy(), y.imag.copy()
+
+
+# =============================================================================
+# Matched filter reference — IFFT(FFT(sig) * conj(FFT(ref)))
+# =============================================================================
+
+def matched_filter_reference(sig_re, sig_im, ref_re, ref_im, fft_size: int = 2048):
+    """Ideal range profile via FFT-domain matched filter.
+
+    range_profile = IFFT( FFT(sig) * conj(FFT(ref)) )
+
+    The RTL pipeline does the same operation but with Q15 twiddles, 32-bit
+    accumulators, and Q15 round-and-saturate after the conjugate multiply.
+    This reference is unquantized.
+
+    Args:
+        sig_re/im, ref_re/im: length-fft_size sequences (int or float)
+        fft_size: matched filter FFT size (default 2048)
+
+    Returns:
+        (range_re, range_im) — float arrays, no Q15 saturation
+    """
+    sig_re = np.asarray(sig_re, dtype=np.float64)
+    sig_im = np.asarray(sig_im, dtype=np.float64)
+    ref_re = np.asarray(ref_re, dtype=np.float64)
+    ref_im = np.asarray(ref_im, dtype=np.float64)
+    s = sig_re + 1j * sig_im
+    r = ref_re + 1j * ref_im
+    S = np.fft.fft(s, n=fft_size)
+    R = np.fft.fft(r, n=fft_size)
+    P = S * np.conj(R)
+    p = np.fft.ifft(P)
+    return p.real.copy(), p.imag.copy()
+
+
+# =============================================================================
+# Doppler reference — Hamming-windowed per-sub-frame 16-pt FFT
+# =============================================================================
+
+def hamming_16_ideal():
+    """Ideal Hamming(16) coefficients, scaled to Q15.
+
+    fpga_model.HAMMING_WINDOW is a hard-coded Q15 LUT. If a coefficient was
+    transcribed wrong, this catches it.
+
+    Definition (matches scipy.signal.windows.hamming(16, sym=False) is
+    DIFFERENT — that's a periodic Hamming. The RTL/fpga_model use the
+    classic symmetric form: w[n] = 0.54 - 0.46*cos(2*pi*n/(N-1)).
+    """
+    n = np.arange(16, dtype=np.float64)
+    w = 0.54 - 0.46 * np.cos(2.0 * np.pi * n / 15.0)
+    return w * 32767.0
+
+
+def doppler_reference(chirp_data_i, chirp_data_q,
+                      num_subframes: int = 3,
+                      chirps_per_subframe: int = 16,
+                      range_bins: int = 512):
+    """Ideal Doppler map using ideal Hamming + numpy.fft, no Q15 quantization.
+
+    Args:
+        chirp_data_i/q: 2D arrays [chirps_per_frame][range_bins], int or float
+        num_subframes: number of independent 16-pt FFTs per range bin (default 3)
+        chirps_per_subframe: 16
+        range_bins: 512
+
+    Returns:
+        (doppler_map_re, doppler_map_im) — 2D float arrays
+            shape [range_bins][num_subframes * chirps_per_subframe]
+        Sub-frame s occupies output bins [s*16 .. s*16+15].
+    """
+    chirp_data_i = np.asarray(chirp_data_i, dtype=np.float64)
+    chirp_data_q = np.asarray(chirp_data_q, dtype=np.float64)
+    chirps_per_frame = num_subframes * chirps_per_subframe
+    if chirp_data_i.shape != (chirps_per_frame, range_bins):
+        raise ValueError(
+            f"chirp_data_i shape {chirp_data_i.shape} != "
+            f"({chirps_per_frame}, {range_bins})"
+        )
+
+    win = hamming_16_ideal()  # Q15-scaled
+    total_bins = num_subframes * chirps_per_subframe
+    out_re = np.zeros((range_bins, total_bins), dtype=np.float64)
+    out_im = np.zeros((range_bins, total_bins), dtype=np.float64)
+
+    for rbin in range(range_bins):
+        for sf in range(num_subframes):
+            start = sf * chirps_per_subframe
+            stop = start + chirps_per_subframe
+            offset = sf * chirps_per_subframe
+
+            # Apply Hamming and divide by 32768 to undo the Q15 scaling so
+            # the comparison is to ideal floating-point amplitudes (the RTL
+            # rounds (data*win + 1<<14) >> 15 which is an approximate /32768).
+            x_re = chirp_data_i[start:stop, rbin] * win / 32768.0
+            x_im = chirp_data_q[start:stop, rbin] * win / 32768.0
+            x = x_re + 1j * x_im
+
+            X = np.fft.fft(x)
+            out_re[rbin, offset:offset + chirps_per_subframe] = X.real
+            out_im[rbin, offset:offset + chirps_per_subframe] = X.imag
+
+    return out_re, out_im
+
+
+# =============================================================================
+# Self-test (sanity checks against numpy's own analytical answers)
+# =============================================================================
+
+def _self_test():
+    """Quick sanity checks."""
+    # NCO: at FTW = 0x4CCCCCCD, frequency = 0.3 * fs = 120 MHz at 400 MSPS.
+    cos_q15, sin_q15 = nco_reference(8, 0x4CCCCCCD, fs=400e6)
+    # First sample should be cos(0)=1, sin(0)=0 in Q15
+    assert abs(cos_q15[0] - 32767.0) < 1.0, f"NCO[0].cos = {cos_q15[0]}"
+    assert abs(sin_q15[0]) < 1.0, f"NCO[0].sin = {sin_q15[0]}"
+
+    # FFT: impulse -> all bins = amplitude
+    in_re = [1000] + [0] * 15
+    in_im = [0] * 16
+    out_re, out_im = fft_reference(in_re, in_im, n=16)
+    for k in range(16):
+        assert abs(out_re[k] - 1000.0) < 1e-9, f"FFT impulse bin {k}: {out_re[k]}"
+
+    # Doppler: zero input -> zero output
+    z_i = np.zeros((48, 512))
+    z_q = np.zeros((48, 512))
+    d_re, d_im = doppler_reference(z_i, z_q, num_subframes=3,
+                                   chirps_per_subframe=16, range_bins=512)
+    assert np.max(np.abs(d_re)) < 1e-9
+    assert np.max(np.abs(d_im)) < 1e-9
+
+    # Hamming: peak at center, edges match formula
+    w = hamming_16_ideal()
+    assert abs(w[0] - 0.08 * 32767.0) < 0.5
+    assert abs(w[7] - (0.54 - 0.46 * np.cos(2 * np.pi * 7 / 15)) * 32767.0) < 0.5
+
+    print("fpga_reference self-test: OK")
+
+
+if __name__ == '__main__':
+    _self_test()