From c30be89dbee33d85d7c6b7ffa721e8133fa54ca5 Mon Sep 17 00:00:00 2001
From: Jason <83615043+JJassonn69@users.noreply.github.com>
Date: Fri, 1 May 2026 16:23:38 +0545
Subject: [PATCH] =?UTF-8?q?test(cosim):=20PR-M.1=20=E2=80=94=20independent?=
 =?UTF-8?q?=20fpga=5Freference.py=20+=20drift=20cosim=20(T-6)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds tb/cosim/fpga_reference.py: numpy/scipy implementation of NCO,
FFT, matched filter, and Doppler. Unlike fpga_model.py — which is a
bit-exact PORT of the RTL (same NCO_SINE_LUT, same twiddle .mem files,
same Q15 quantization) — this reference computes the algorithm from
analytical formulas with no LUT or quantization. It is the third leg
of the cosim triangle so transcription bugs that exist identically in
both the Python twin AND the RTL no longer hide.

Adds tb/cosim/compare_independent.py: runs canonical stimulus through
both twin and reference and reports drift. Bytewise LUT spot-checks
(NCO_SINE_LUT, fft_twiddle_16.mem, fft_twiddle_2048.mem,
HAMMING_WINDOW) plus end-to-end peak/roundtrip invariants for NCO,
FFT-2048, MF, Doppler.

Findings (12/13 drift checks pass):
  * NCO_SINE_LUT, fft_twiddle_16.mem, fft_twiddle_2048.mem all match
    their analytical Q15 values bytewise (max dev = 0 LSB) — the two
    biggest hand-transcribed LUTs are clean.
  * HAMMING_WINDOW [FAIL] — max 740 LSB drift from documented formula
    0.54-0.46*cos(2*pi*n/15) at n=5 (LUT=25971, ideal=25231). The
    same wrong values appear in fpga_model.HAMMING_WINDOW and
    doppler_processor.v lines 99-114; both share the drift, which is
    why every existing Doppler cosim has been passing bit-exactly. To
    resolve: either regen the LUTs to match the documented formula
    and re-bless Doppler goldens, or update the comments to describe
    the actual values (no clean closed-form match yet identified).

Not wired into run_regression.sh in this commit so the drift gating
decision (fix vs document) can be made deliberately.
---
 .../9_2_FPGA/tb/cosim/compare_independent.py  | 416 ++++++++++++++++++
 .../9_2_FPGA/tb/cosim/fpga_reference.py       | 245 +++++++++++
 2 files changed, 661 insertions(+)
 create mode 100644 9_Firmware/9_2_FPGA/tb/cosim/compare_independent.py
 create mode 100644 9_Firmware/9_2_FPGA/tb/cosim/fpga_reference.py

diff --git a/9_Firmware/9_2_FPGA/tb/cosim/compare_independent.py b/9_Firmware/9_2_FPGA/tb/cosim/compare_independent.py
new file mode 100644
index 0000000..89c9bf0
--- /dev/null
+++ b/9_Firmware/9_2_FPGA/tb/cosim/compare_independent.py
@@ -0,0 +1,416 @@
+#!/usr/bin/env python3
+"""
+T-6 drift cosim — twin (fpga_model.py, RTL-mirroring) vs reference
+(fpga_reference.py, numpy-based).
+
+Designed to catch the bug class where a transcription error exists
+identically in BOTH the bit-exact Python twin and the RTL (e.g. a
+hand-copied LUT entry, a regenerated .mem with wrong arithmetic, a
+window coefficient typo). The existing cosim suite cannot detect that
+class because both sides of the comparison are computing the same
+algorithm; this script gives an independent third leg.
+
+Strategy (from highest-value to lowest):
+
+  1. Bytewise LUT spot-checks — every entry of every LUT/ROM compared
+     to its analytical Q15 value. A single corrupted entry fails here.
+       * NCO_SINE_LUT (64 entries, sin(pi*k/128) Q15)
+       * Twiddle ROMs (.mem files) for N=16, N=2048
+       * HAMMING_WINDOW (16 entries, sym Hamming N=16 Q15)
+
+  2. End-to-end peak-position invariants — feed canonical inputs
+     through both twin and reference; the peak bin/index must match.
+
+  3. Roundtrip / structural invariants — FFT->IFFT recovers input,
+     NCO output stays on the unit circle, etc.
+
+We intentionally avoid Q15-saturated magnitude comparisons; those are
+where the twin's bit-exact Q15 saturation differs from numpy's float
+output, and they don't represent a real drift.
+"""
+
+from __future__ import annotations
+
+import math
+import sys
+from pathlib import Path
+
+import numpy as np
+
+# Make local imports work when invoked from anywhere
+THIS_DIR = Path(__file__).resolve().parent
+sys.path.insert(0, str(THIS_DIR))
+
+import fpga_reference as ref  # noqa: E402
+from fpga_model import (  # noqa: E402
+    HAMMING_WINDOW,
+    NCO,
+    DopplerProcessor,
+    FFTEngine,
+    MatchedFilterChain,
+    NCO_SINE_LUT,
+    load_twiddle_rom,
+)
+
+
+# =============================================================================
+# Tolerances
+# =============================================================================
+
+TOL_NCO_LUT_LSB        = 1     # NCO_SINE_LUT: tightest possible
+TOL_TWIDDLE_LSB        = 1     # twiddle ROMs: same — quarter-wave Q15 cosine
+TOL_HAMMING_LSB        = 4     # 4 LSB ≈ 1.2e-4 rounding budget on Q15 round
+TOL_NCO_MAG_REL        = 0.04  # quarter-wave LUT artifact at quadrant edges
+TOL_FFT_ROUNDTRIP_LSB  = 60    # 11 stages × Q15 noise on 2048-pt; empirical
+
+
+# =============================================================================
+# Result tracker
+# =============================================================================
+
+class CheckResult:
+    def __init__(self):
+        self.pass_count = 0
+        self.fail_count = 0
+
+    def check(self, ok: bool, label: str, detail: str = ""):
+        tag = "PASS" if ok else "FAIL"
+        msg = f"  [{tag}] {label}"
+        if detail:
+            msg += f" — {detail}"
+        print(msg)
+        if ok:
+            self.pass_count += 1
+        else:
+            self.fail_count += 1
+
+    def info(self, label: str, detail: str = ""):
+        print(f"  [INFO] {label}" + (f" — {detail}" if detail else ""))
+
+    def summary(self):
+        total = self.pass_count + self.fail_count
+        print(f"\n  RESULT: {self.pass_count}/{total} drift checks passed")
+        return self.fail_count == 0
+
+
+# =============================================================================
+# Section 1 — bytewise LUT spot-checks
+# =============================================================================
+
+def check_nco_lut(result: CheckResult):
+    print("\n--- NCO_SINE_LUT bytewise check ---")
+    max_dev = 0
+    bad = []
+    for k in range(64):
+        ideal = round(32767.0 * math.sin(math.pi * k / 128.0))
+        dev = abs(NCO_SINE_LUT[k] - ideal)
+        if dev > max_dev:
+            max_dev = dev
+        if dev > TOL_NCO_LUT_LSB:
+            bad.append((k, NCO_SINE_LUT[k], ideal, dev))
+    result.check(
+        max_dev <= TOL_NCO_LUT_LSB,
+        f"NCO_SINE_LUT: all 64 entries match sin(pi*k/128) Q15 (tol {TOL_NCO_LUT_LSB} LSB)",
+        f"max |LUT - ideal| = {max_dev} LSB" + (
+            f"; {len(bad)} entries off, e.g. k={bad[0][0]}: LUT={bad[0][1]}, "
+            f"ideal={bad[0][2]}" if bad else ""
+        )
+    )
+
+
+def check_twiddle_rom(result: CheckResult, n: int, mem_filename: str):
+    print(f"\n--- Twiddle ROM ({mem_filename}, N={n}) bytewise check ---")
+    rom_path = Path(__file__).resolve().parents[2] / mem_filename
+    if not rom_path.exists():
+        result.check(False, f"Twiddle ROM file present at {rom_path}", "missing")
+        return
+    cos_rom = load_twiddle_rom(str(rom_path), n=n)
+    expected_entries = n // 4
+    if len(cos_rom) != expected_entries:
+        result.check(
+            False,
+            f"Twiddle ROM length = N/4 = {expected_entries}",
+            f"got {len(cos_rom)}"
+        )
+        return
+
+    max_dev = 0
+    bad = []
+    for k in range(expected_entries):
+        ideal = round(32767.0 * math.cos(2.0 * math.pi * k / n))
+        # Q15 representation tops at 32767 — ideal cos(0)=32768 rounds down.
+        if ideal == 32768:
+            ideal = 32767
+        dev = abs(cos_rom[k] - ideal)
+        if dev > max_dev:
+            max_dev = dev
+        if dev > TOL_TWIDDLE_LSB:
+            bad.append((k, cos_rom[k], ideal, dev))
+    result.check(
+        max_dev <= TOL_TWIDDLE_LSB,
+        f"{mem_filename}: all {expected_entries} entries match cos(2pi*k/{n}) Q15 (tol {TOL_TWIDDLE_LSB} LSB)",
+        f"max |ROM - ideal| = {max_dev} LSB" + (
+            f"; {len(bad)} bad, e.g. k={bad[0][0]}: ROM={bad[0][1]}, ideal={bad[0][2]}"
+            if bad else ""
+        )
+    )
+
+
+def check_hamming_lut(result: CheckResult):
+    print("\n--- HAMMING_WINDOW bytewise check ---")
+    win_lut = np.array(HAMMING_WINDOW, dtype=np.int64)
+    win_ref = np.round(ref.hamming_16_ideal()).astype(np.int64)
+    diff = np.abs(win_lut - win_ref)
+    max_dev = int(diff.max())
+    worst_idx = int(np.argmax(diff))
+    bad = [(int(i), int(win_lut[i]), int(win_ref[i]), int(diff[i]))
+           for i in range(16) if diff[i] > TOL_HAMMING_LSB]
+    result.check(
+        max_dev <= TOL_HAMMING_LSB,
+        f"HAMMING_WINDOW: all 16 entries match 0.54-0.46*cos(2pi*n/15) Q15 (tol {TOL_HAMMING_LSB} LSB)",
+        f"max |LUT - ideal| = {max_dev} LSB at n={worst_idx} "
+        f"(LUT={int(win_lut[worst_idx])}, ideal={int(win_ref[worst_idx])})"
+    )
+    if bad:
+        result.info(
+            "HAMMING_WINDOW drift detected — both fpga_model.py and "
+            "doppler_processor.v contain values that diverge from the "
+            "documented formula. Resolution: either update the comments to "
+            "reflect the actual values, or regen the LUTs to match the "
+            "formula and re-bless the affected goldens."
+        )
+
+
+# =============================================================================
+# Section 2 — end-to-end peak / structural invariants
+# =============================================================================
+
+def check_nco_invariants(result: CheckResult):
+    """NCO output should stay on unit circle and have the right frequency."""
+    print("\n--- NCO output invariants (unit-circle, frequency) ---")
+    ftw = 0x4CCCCCCD       # 120 MHz at 400 MSPS = 0.30 cycles/sample
+    n_capture = 1024
+
+    nco = NCO()
+    cos_stream: list[int] = []
+    sin_stream: list[int] = []
+    for _ in range(n_capture + 64):
+        s, c, ready = nco.step(ftw)
+        if ready:
+            cos_stream.append(c)
+            sin_stream.append(s)
+        if len(cos_stream) >= n_capture:
+            break
+    cos_arr = np.array(cos_stream, dtype=np.float64)
+    sin_arr = np.array(sin_stream, dtype=np.float64)
+
+    # Unit-circle invariant: cos² + sin² ≈ Q15_max². The quarter-wave LUT
+    # uses sin(k*pi/128) for k=0..63 — at the right edge of each quadrant
+    # it falls back to LUT[0]=0 instead of cos(63*pi/128)≈sin(pi/128), so
+    # the magnitude can dip by up to ~3 % of full scale. Tolerance 4 % covers
+    # this expected architectural artifact.
+    mag_ratio = (cos_arr ** 2 + sin_arr ** 2) / (32767.0 ** 2)
+    max_mag_dev = float(np.max(np.abs(mag_ratio - 1.0)))
+    result.check(
+        max_mag_dev <= TOL_NCO_MAG_REL,
+        "NCO output on unit circle (cos²+sin² ≈ 1)",
+        f"max |mag²-1| = {max_mag_dev * 100:.2f} % (tol {TOL_NCO_MAG_REL * 100:.0f} %)"
+    )
+
+    # Frequency invariant: dominant FFT bin of cos+j*sin should equal
+    # round(ftw / 2^32 * N).
+    z = cos_arr + 1j * sin_arr
+    Z = np.fft.fft(z)
+    peak_bin = int(np.argmax(np.abs(Z)))
+    expected_bin = int(round(ftw / (1 << 32) * n_capture))
+    result.check(
+        abs(peak_bin - expected_bin) <= 1,
+        f"NCO dominant frequency at FTW = {ftw:08X} (expected bin {expected_bin})",
+        f"got bin {peak_bin}"
+    )
+
+
+def check_fft_invariants(result: CheckResult):
+    """FFT structural sanity: impulse, roundtrip, peak position."""
+    print("\n--- FFT-2048 invariants (peak position, roundtrip, impulse) ---")
+    n = 2048
+    fft = FFTEngine(n=n)
+
+    # Impulse → flat spectrum at amplitude (no saturation; amp < 32767/N is overkill).
+    in_re = [1000] + [0] * (n - 1)
+    in_im = [0] * n
+    twin_re, twin_im = fft.compute(in_re, in_im, inverse=False)
+    flat_max = max(max(twin_re) - 1000, 1000 - min(twin_re),
+                   max(twin_im), -min(twin_im))
+    result.check(
+        flat_max <= 5,
+        "FFT-2048(impulse): all bins ≈ amplitude (1000)",
+        f"max |bin - 1000| = {flat_max}"
+    )
+
+    # Single COMPLEX tone (cos + j*sin) → single peak at bin_k (no conjugate
+    # at N-bin_k as you'd get with a real cosine). Amp small enough that
+    # peak = amp*N stays below Q15 saturation (32767).
+    bin_k = 137
+    amp = 15
+    in_re = [int(round(amp * math.cos(2 * math.pi * bin_k * i / n))) for i in range(n)]
+    in_im = [int(round(amp * math.sin(2 * math.pi * bin_k * i / n))) for i in range(n)]
+    twin_re, twin_im = fft.compute(in_re, in_im, inverse=False)
+    ref_re, ref_im = ref.fft_reference(in_re, in_im, n=n)
+    twin_mag2 = np.array(twin_re) ** 2 + np.array(twin_im) ** 2
+    ref_mag2 = np.asarray(ref_re) ** 2 + np.asarray(ref_im) ** 2
+    twin_peak = int(np.argmax(twin_mag2))
+    ref_peak = int(np.argmax(ref_mag2))
+    result.check(
+        twin_peak == ref_peak == bin_k,
+        f"FFT-2048(complex tone): peak at bin {bin_k}",
+        f"twin={twin_peak}, ref={ref_peak}"
+    )
+
+    # Roundtrip — small amplitude (peak = amp*N/2 ≤ 32767 → amp ≤ 32) so the
+    # forward FFT does not saturate, then IFFT should recover input within
+    # 11×Q15 butterfly noise.
+    rt_amp = 30
+    in_re = [int(rt_amp * math.sin(2 * math.pi * 73 * i / n)) for i in range(n)]
+    in_im = [0] * n
+    fwd_re, fwd_im = fft.compute(in_re, in_im, inverse=False)
+    rt_re, _ = fft.compute(fwd_re, fwd_im, inverse=True)
+    rt_max_err = max(abs(rt_re[i] - in_re[i]) for i in range(n))
+    result.check(
+        rt_max_err <= TOL_FFT_ROUNDTRIP_LSB,
+        f"FFT-2048(roundtrip, amp={rt_amp}): FFT->IFFT recovers input within {TOL_FFT_ROUNDTRIP_LSB} LSB",
+        f"max |rt - in| = {rt_max_err}"
+    )
+
+
+def check_mf_invariants(result: CheckResult):
+    """MF: peak must land at the injected delay; both twin and ref must agree."""
+    print("\n--- Matched filter invariants (peak position) ---")
+    n = 2048
+    mf = MatchedFilterChain(fft_size=n)
+
+    delay = 100
+    bin_k = 17
+    amp = 200
+    sig_re = [0] * n
+    sig_im = [0] * n
+    ref_re_in = [0] * n
+    ref_im_in = [0] * n
+    pulse_len = 256
+    for i in range(pulse_len):
+        ref_re_in[i] = int(round(amp * math.cos(2 * math.pi * bin_k * i / pulse_len)))
+        ref_im_in[i] = int(round(amp * math.sin(2 * math.pi * bin_k * i / pulse_len)))
+        sig_re[i + delay] = ref_re_in[i]
+        sig_im[i + delay] = ref_im_in[i]
+
+    twin_re, twin_im = mf.process(sig_re, sig_im, ref_re_in, ref_im_in)
+    ref_real, ref_imag = ref.matched_filter_reference(
+        sig_re, sig_im, ref_re_in, ref_im_in, fft_size=n
+    )
+    twin_mag = np.sqrt(np.array(twin_re) ** 2 + np.array(twin_im) ** 2)
+    ref_mag = np.sqrt(np.asarray(ref_real) ** 2 + np.asarray(ref_imag) ** 2)
+    twin_peak = int(np.argmax(twin_mag))
+    ref_peak = int(np.argmax(ref_mag))
+    result.check(
+        twin_peak == ref_peak == delay,
+        f"MF: peak at injected delay (bin {delay})",
+        f"twin={twin_peak}, ref={ref_peak}"
+    )
+
+    # Sidelobe behaviour: peak should be N×stronger than median.
+    twin_peak_val = float(twin_mag[delay])
+    twin_median = float(np.median(twin_mag))
+    pk_ratio = twin_peak_val / max(twin_median, 1.0)
+    result.check(
+        pk_ratio >= 5.0,
+        "MF peak-to-median ratio ≥ 5",
+        f"got ratio {pk_ratio:.2f}"
+    )
+
+
+def check_doppler_invariants(result: CheckResult):
+    """Doppler: peak per sub-frame must be at the injected Doppler bin."""
+    print("\n--- Doppler invariants (peak per sub-frame) ---")
+    chirps_per_frame = 48
+    range_bins = 16  # keep small; algorithm identical at any size
+    num_subframes = 3
+    chirps_per_subframe = 16
+
+    inject = [(0, 5), (1, 11), (2, 3)]
+    target_rbin = 10
+    amp = 1500  # avoid Q15 saturation: peak = amp * N/2 = 12000 < 32767
+    chirp_i = np.zeros((chirps_per_frame, range_bins), dtype=np.int64)
+    chirp_q = np.zeros((chirps_per_frame, range_bins), dtype=np.int64)
+    for sf, dop_bin in inject:
+        for c in range(chirps_per_subframe):
+            chirp_idx = sf * chirps_per_subframe + c
+            phase = 2 * math.pi * dop_bin * c / chirps_per_subframe
+            chirp_i[chirp_idx, target_rbin] = int(round(amp * math.cos(phase)))
+            chirp_q[chirp_idx, target_rbin] = int(round(amp * math.sin(phase)))
+
+    dop = DopplerProcessor(num_subframes=num_subframes,
+                           chirps_per_frame=chirps_per_frame)
+    dop.RANGE_BINS = range_bins
+    twin_dop_i, twin_dop_q = dop.process_frame(
+        chirp_i.tolist(), chirp_q.tolist()
+    )
+    ref_re, ref_im = ref.doppler_reference(
+        chirp_i, chirp_q,
+        num_subframes=num_subframes,
+        chirps_per_subframe=chirps_per_subframe,
+        range_bins=range_bins,
+    )
+
+    twin_mag = np.sqrt(np.array(twin_dop_i) ** 2 + np.array(twin_dop_q) ** 2)
+    ref_mag = np.sqrt(ref_re ** 2 + ref_im ** 2)
+    expected_bins = [sf * chirps_per_subframe + b for sf, b in inject]
+
+    sf_peaks_twin = []
+    sf_peaks_ref = []
+    for sf in range(num_subframes):
+        lo = sf * chirps_per_subframe
+        hi = lo + chirps_per_subframe
+        sf_peaks_twin.append(lo + int(np.argmax(twin_mag[target_rbin, lo:hi])))
+        sf_peaks_ref.append(lo + int(np.argmax(ref_mag[target_rbin, lo:hi])))
+
+    result.check(
+        sf_peaks_twin == expected_bins,
+        f"Doppler twin: peaks per sub-frame at {expected_bins}",
+        f"got {sf_peaks_twin}"
+    )
+    result.check(
+        sf_peaks_ref == expected_bins,
+        f"Doppler reference: peaks per sub-frame at {expected_bins}",
+        f"got {sf_peaks_ref}"
+    )
+
+
+# =============================================================================
+# Main
+# =============================================================================
+
+def main():
+    print("============================================================")
+    print("  T-6 INDEPENDENT REFERENCE DRIFT COSIM")
+    print("    fpga_model.py (twin) vs fpga_reference.py (numpy ideal)")
+    print("============================================================")
+    result = CheckResult()
+
+    # 1. LUT bytewise spot-checks (highest-value transcription detector)
+    check_nco_lut(result)
+    check_twiddle_rom(result, n=16,   mem_filename="fft_twiddle_16.mem")
+    check_twiddle_rom(result, n=2048, mem_filename="fft_twiddle_2048.mem")
+    check_hamming_lut(result)
+
+    # 2/3. End-to-end invariants
+    check_nco_invariants(result)
+    check_fft_invariants(result)
+    check_mf_invariants(result)
+    check_doppler_invariants(result)
+
+    ok = result.summary()
+    return 0 if ok else 1
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/9_Firmware/9_2_FPGA/tb/cosim/fpga_reference.py b/9_Firmware/9_2_FPGA/tb/cosim/fpga_reference.py
new file mode 100644
index 0000000..c913a75
--- /dev/null
+++ b/9_Firmware/9_2_FPGA/tb/cosim/fpga_reference.py
@@ -0,0 +1,245 @@
+#!/usr/bin/env python3
+r"""
+Independent floating-point reference for AERIS-10 signal processing chain.
+
+Unlike fpga_model.py (which is a bit-exact PORT of the RTL — same NCO LUT,
+same twiddle ROMs, same Q15 quantization), this module computes the
+algorithm using ideal numpy/scipy primitives. It is the second leg of the
+T-6 three-way triangulation:
+
+    fpga_model.py (twin)  --bit-exact--  RTL simulation
+              \                            /
+               \                          /
+                \-- within tolerance --> /
+                  fpga_reference.py
+                  (this file)
+
+Drift signal interpretation:
+  * twin == reference (within tol) and twin == RTL (bit-exact)
+        -> chain healthy
+  * twin == reference (within tol) but twin != RTL (bit-exact)
+        -> RTL diverged from spec (real RTL bug)
+  * twin != reference (outside tol) but twin == RTL (bit-exact)
+        -> twin and RTL share a transcription error
+           (e.g. wrong NCO_SINE_LUT entry, wrong twiddle ROM, wrong window
+           coefficients). This is the bug class T-6 was opened to catch.
+
+Coverage in this revision (highest transcription risk first):
+  * NCO  — numpy.cos/sin vs the 64-entry quarter-wave NCO_SINE_LUT
+  * FFT  — numpy.fft.fft/ifft vs Q15 twiddle ROM butterfly
+  * MF   — numpy ifft(fft(sig) * conj(fft(ref))) vs RTL pipeline
+  * Doppler — numpy.fft + ideal Hamming window vs Q15 window LUT + RTL FFT
+
+Out of scope here (lower transcription risk, deferred):
+  * CIC / FIR — coefficient files are derived from independent generators
+  * Mixer / RangeBinDecimator — trivially correct in both
+"""
+
+from __future__ import annotations
+
+import numpy as np
+
+
+# =============================================================================
+# NCO reference — ideal complex sinusoid
+# =============================================================================
+
+def nco_reference(num_samples: int, ftw: int, fs: float = 400e6,
+                  phase_offset_deg: float = 0.0):
+    """Ideal floating-point NCO output, scaled to match Q15 fpga_model.
+
+    The fpga_model.NCO uses a 32-bit phase accumulator stepped by ftw, with
+    a 64-entry quarter-wave cos LUT scaled to ±0x7FFF. This reference uses
+    numpy.cos/sin directly with no LUT.
+
+    Args:
+        num_samples: number of samples to produce
+        ftw: 32-bit unsigned phase increment per sample
+        fs: sample rate (Hz)
+        phase_offset_deg: phase offset (degrees), default 0
+
+    Returns:
+        (cos_q15, sin_q15) — float arrays of length num_samples, in Q15 scale
+    """
+    ftw = int(ftw) & 0xFFFFFFFF
+    n = np.arange(num_samples, dtype=np.float64)
+    # Phase accumulator semantics: phase[k] = (k * ftw) mod 2^32
+    # Convert to radians: 2*pi * phase / 2^32
+    phase_rad = 2.0 * np.pi * (n * ftw) / (1 << 32) + np.deg2rad(phase_offset_deg)
+    cos_q15 = np.cos(phase_rad) * 32767.0
+    sin_q15 = np.sin(phase_rad) * 32767.0
+    return cos_q15, sin_q15
+
+
+# =============================================================================
+# FFT reference — numpy.fft.fft / ifft
+# =============================================================================
+
+def fft_reference(in_re, in_im, n: int = 2048, inverse: bool = False):
+    """Ideal floating-point FFT.
+
+    Scaling matches the RTL convention:
+      forward: y[k] = sum_n x[n] * exp(-j*2*pi*k*n/N)            (no 1/N)
+      inverse: y[n] = (1/N) * sum_k X[k] * exp(+j*2*pi*k*n/N)    (1/N applied)
+
+    The RTL fft_engine implements >>>LOG2N before output saturation when
+    inverse=1, which is the same 1/N. numpy.fft.ifft already includes the
+    1/N factor, so we use it directly with no rescaling.
+
+    Args:
+        in_re/in_im: length-N int or float sequences
+        n: FFT size (16, 1024, 2048)
+        inverse: True for IFFT
+
+    Returns:
+        (out_re, out_im) — float arrays, no Q15 saturation
+    """
+    re = np.asarray(in_re, dtype=np.float64)
+    im = np.asarray(in_im, dtype=np.float64)
+    if len(re) != n or len(im) != n:
+        raise ValueError(f"input length {len(re)} != N={n}")
+    x = re + 1j * im
+    y = np.fft.ifft(x) if inverse else np.fft.fft(x)
+    return y.real.copy(), y.imag.copy()
+
+
+# =============================================================================
+# Matched filter reference — IFFT(FFT(sig) * conj(FFT(ref)))
+# =============================================================================
+
+def matched_filter_reference(sig_re, sig_im, ref_re, ref_im, fft_size: int = 2048):
+    """Ideal range profile via FFT-domain matched filter.
+
+    range_profile = IFFT( FFT(sig) * conj(FFT(ref)) )
+
+    The RTL pipeline does the same operation but with Q15 twiddles, 32-bit
+    accumulators, and Q15 round-and-saturate after the conjugate multiply.
+    This reference is unquantized.
+
+    Args:
+        sig_re/im, ref_re/im: length-fft_size sequences (int or float)
+        fft_size: matched filter FFT size (default 2048)
+
+    Returns:
+        (range_re, range_im) — float arrays, no Q15 saturation
+    """
+    sig_re = np.asarray(sig_re, dtype=np.float64)
+    sig_im = np.asarray(sig_im, dtype=np.float64)
+    ref_re = np.asarray(ref_re, dtype=np.float64)
+    ref_im = np.asarray(ref_im, dtype=np.float64)
+    s = sig_re + 1j * sig_im
+    r = ref_re + 1j * ref_im
+    S = np.fft.fft(s, n=fft_size)
+    R = np.fft.fft(r, n=fft_size)
+    P = S * np.conj(R)
+    p = np.fft.ifft(P)
+    return p.real.copy(), p.imag.copy()
+
+
+# =============================================================================
+# Doppler reference — Hamming-windowed per-sub-frame 16-pt FFT
+# =============================================================================
+
+def hamming_16_ideal():
+    """Ideal Hamming(16) coefficients, scaled to Q15.
+
+    fpga_model.HAMMING_WINDOW is a hard-coded Q15 LUT. If a coefficient was
+    transcribed wrong, this catches it.
+
+    Definition (matches scipy.signal.windows.hamming(16, sym=False) is
+    DIFFERENT — that's a periodic Hamming. The RTL/fpga_model use the
+    classic symmetric form: w[n] = 0.54 - 0.46*cos(2*pi*n/(N-1)).
+    """
+    n = np.arange(16, dtype=np.float64)
+    w = 0.54 - 0.46 * np.cos(2.0 * np.pi * n / 15.0)
+    return w * 32767.0
+
+
+def doppler_reference(chirp_data_i, chirp_data_q,
+                      num_subframes: int = 3,
+                      chirps_per_subframe: int = 16,
+                      range_bins: int = 512):
+    """Ideal Doppler map using ideal Hamming + numpy.fft, no Q15 quantization.
+
+    Args:
+        chirp_data_i/q: 2D arrays [chirps_per_frame][range_bins], int or float
+        num_subframes: number of independent 16-pt FFTs per range bin (default 3)
+        chirps_per_subframe: 16
+        range_bins: 512
+
+    Returns:
+        (doppler_map_re, doppler_map_im) — 2D float arrays
+            shape [range_bins][num_subframes * chirps_per_subframe]
+        Sub-frame s occupies output bins [s*16 .. s*16+15].
+    """
+    chirp_data_i = np.asarray(chirp_data_i, dtype=np.float64)
+    chirp_data_q = np.asarray(chirp_data_q, dtype=np.float64)
+    chirps_per_frame = num_subframes * chirps_per_subframe
+    if chirp_data_i.shape != (chirps_per_frame, range_bins):
+        raise ValueError(
+            f"chirp_data_i shape {chirp_data_i.shape} != "
+            f"({chirps_per_frame}, {range_bins})"
+        )
+
+    win = hamming_16_ideal()  # Q15-scaled
+    total_bins = num_subframes * chirps_per_subframe
+    out_re = np.zeros((range_bins, total_bins), dtype=np.float64)
+    out_im = np.zeros((range_bins, total_bins), dtype=np.float64)
+
+    for rbin in range(range_bins):
+        for sf in range(num_subframes):
+            start = sf * chirps_per_subframe
+            stop = start + chirps_per_subframe
+            offset = sf * chirps_per_subframe
+
+            # Apply Hamming and divide by 32768 to undo the Q15 scaling so
+            # the comparison is to ideal floating-point amplitudes (the RTL
+            # rounds (data*win + 1<<14) >> 15 which is an approximate /32768).
+            x_re = chirp_data_i[start:stop, rbin] * win / 32768.0
+            x_im = chirp_data_q[start:stop, rbin] * win / 32768.0
+            x = x_re + 1j * x_im
+
+            X = np.fft.fft(x)
+            out_re[rbin, offset:offset + chirps_per_subframe] = X.real
+            out_im[rbin, offset:offset + chirps_per_subframe] = X.imag
+
+    return out_re, out_im
+
+
+# =============================================================================
+# Self-test (sanity checks against numpy's own analytical answers)
+# =============================================================================
+
+def _self_test():
+    """Quick sanity checks."""
+    # NCO: at FTW = 0x4CCCCCCD, frequency = 0.3 * fs = 120 MHz at 400 MSPS.
+    cos_q15, sin_q15 = nco_reference(8, 0x4CCCCCCD, fs=400e6)
+    # First sample should be cos(0)=1, sin(0)=0 in Q15
+    assert abs(cos_q15[0] - 32767.0) < 1.0, f"NCO[0].cos = {cos_q15[0]}"
+    assert abs(sin_q15[0]) < 1.0, f"NCO[0].sin = {sin_q15[0]}"
+
+    # FFT: impulse -> all bins = amplitude
+    in_re = [1000] + [0] * 15
+    in_im = [0] * 16
+    out_re, out_im = fft_reference(in_re, in_im, n=16)
+    for k in range(16):
+        assert abs(out_re[k] - 1000.0) < 1e-9, f"FFT impulse bin {k}: {out_re[k]}"
+
+    # Doppler: zero input -> zero output
+    z_i = np.zeros((48, 512))
+    z_q = np.zeros((48, 512))
+    d_re, d_im = doppler_reference(z_i, z_q, num_subframes=3,
+                                   chirps_per_subframe=16, range_bins=512)
+    assert np.max(np.abs(d_re)) < 1e-9
+    assert np.max(np.abs(d_im)) < 1e-9
+
+    # Hamming: peak at center, edges match formula
+    w = hamming_16_ideal()
+    assert abs(w[0] - 0.08 * 32767.0) < 0.5
+    assert abs(w[7] - (0.54 - 0.46 * np.cos(2 * np.pi * 7 / 15)) * 32767.0) < 0.5
+
+    print("fpga_reference self-test: OK")
+
+
+if __name__ == '__main__':
+    _self_test()