diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5121da1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,30 @@
+work-dpi/
+dw*
+site/*
+*.ucdb
+work/*
+transcript
+*.ini
+*.wlf
+*.vstf
+wlft*
+*nfs*
+*.sig
+*.dtb
+*.dasm
+obj_dir/*
+/tmp*
+build/
+trace_*
+mems
+*log
+.bender
+.dvt
+.settings
+*.o
+install
+.projectBender.lock
+Bender.lock
+logs
+bin
+hardware/deps/*
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 0000000..4351438
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "hardware/deps/spatz"]
+	path = hardware/deps/spatz
+	url = https://github.com/pulp-platform/spatz.git
diff --git a/Bender.local b/Bender.local
new file mode 100644
index 0000000..6971f39
--- /dev/null
+++ b/Bender.local
@@ -0,0 +1,7 @@
+# Copyright 2023 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+overrides:
+  axi: { git: "https://github.com/pulp-platform/axi.git", version: =0.39.1-beta }
+  register_interface: { git: "https://github.com/pulp-platform/register_interface.git", version: 0.3.8 }
diff --git a/Bender.yml b/Bender.yml
new file mode 100644
index 0000000..aaf8a53
--- /dev/null
+++ b/Bender.yml
@@ -0,0 +1,33 @@
+# Copyright 2025 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+package:
+  name: cachepool
+
+dependencies:
+  axi: { git: "https://github.com/pulp-platform/axi.git", version: =0.39.1-beta }
+  axi_riscv_atomics: { git: "https://github.com/pulp-platform/axi_riscv_atomics.git", version: 0.7.0 }
+  common_cells: { git: "https://github.com/pulp-platform/common_cells.git", version: 1.28.0 }
+  FPnew: { git: "https://github.com/pulp-platform/cvfpu.git", rev: pulp-v0.1.3 }
+  idma: { git: "https://github.com/pulp-platform/iDMA.git", version: 0.4.2 }
+  register_interface: { git: "https://github.com/pulp-platform/register_interface.git", version: 0.3.8 }
+  riscv-dbg: { git: "https://github.com/pulp-platform/riscv-dbg.git", version: 0.7.0 }
+  tech_cells_generic: { git: "https://github.com/pulp-platform/tech_cells_generic.git", version: 0.2.11 }
+  insitu-cache: { git: "git@iis-git.ee.ethz.ch:flamingo/spatz_cache_wrapper.git", rev: main }
+  spatz: { path: "hardware/deps/spatz" }
+  # dram_rtl_sim: { path: "hardware/deps/dram_rtl_sim" }
+
+workspace:
+  checkout_dir: "./hardware/deps"
+
+export_include_dirs:
+
+sources:
+  - hardware/src/tcdm_cache_interco.sv
+  - hardware/src/cachepool_pkg.sv
+  - hardware/src/cachepool_tile.sv
+  - hardware/src/cachepool_cluster.sv
+  # testbench
+  - hardware/tb/cachepool_cluster_wrapper.sv
+  - hardware/tb/testharness.sv
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..b234715
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,307 @@
+# Copyright 2025 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+# Author: Diyou Shen, ETH Zurich
+
+# Base Directory
+SHELL = /usr/bin/env bash
+ROOT_DIR := $(patsubst %/,%, $(dir $(abspath $(lastword $(MAKEFILE_LIST)))))
+CACHEPOOL_DIR := $(shell git rev-parse --show-toplevel 2>/dev/null || echo $$CACHEPOOL_DIR)
+
+
+# Directory Paths
+PYTHON ?= python3.6
+
+## Spatz related
+SPATZ_DIR ?= $(CACHEPOOL_DIR)/hardware/deps/spatz
+SPZ_CLS_DIR ?= ${SPATZ_DIR}/hw/system/spatz_cluster
+
+## Toolchain related
+TOOLCHAIN_DIR ?= ${SOFTWARE_DIR}/toolchain
+INSTALL_PREFIX ?= install
+INSTALL_DIR ?= ${ROOT_DIR}/${INSTALL_PREFIX}
+GCC_INSTALL_DIR ?= ${INSTALL_DIR}/riscv-gcc
+ISA_SIM_INSTALL_DIR ?= ${INSTALL_DIR}/riscv-isa-sim
+LLVM_INSTALL_DIR ?= ${INSTALL_DIR}/llvm
+HALIDE_INSTALL_DIR ?= ${INSTALL_DIR}/halide
+BENDER_INSTALL_DIR ?= ${INSTALL_DIR}/bender
+VERILATOR_INSTALL_DIR ?= ${INSTALL_DIR}/verilator
+RISCV_TESTS_DIR ?= ${SOFTWARE_DIR}/riscv-tests
+
+## Software related
+SOFTWARE_DIR ?= ${CACHEPOOL_DIR}/software
+SPATZ_SW_DIR ?= ${SPATZ_DIR}/sw
+
+## Simulation related
+SIM_DIR ?= ${CACHEPOOL_DIR}/sim
+### local C lib for simulation
+SIMLIB_DIR ?= ${SIM_DIR}/simlib
+### Snitch testbench C lib for simulation
+SNLIB_DIR ?= ${SPATZ_DIR}/hw/ip/snitch_test/src
+### Spatz bootrom C lib for simulation
+BOOTLIB_DIR ?= ${SPZ_CLS_DIR}/test
+### QuestaSim work directory
+WORK_DIR ?= ${SIM_DIR}/work
+SIMBIN_DIR ?= ${SIM_DIR}/bin
+## Bender
+BENDER ?= ${BENDER_INSTALL_DIR}/bender
+CACHE_PATH := $(shell $(BENDER) path insitu-cache)
+
+# Configurations
+CFG_DIR ?= ${CACHEPOOL_DIR}/cfg
+CFG ?= cachepool.hjson
+
+# Tools
+COMPILER ?= llvm
+
+CMAKE ?= cmake
+# CC and CXX are Makefile default variables that are always defined. Hence, only
+# overwrite them when they still carry the built-in default (origin "default").
+ifeq ($(origin CC),default)
+  CC = gcc
+endif
+ifeq ($(origin CXX),default)
+  CXX = g++
+endif
+
+############
+# Bender   #
+############
+
+BENDER_VERSION = 0.28.1
+
+bender: check-bender
+check-bender:
+	@if [ -x $(BENDER_INSTALL_DIR)/bender ]; then \
+		req="bender $(BENDER_VERSION)"; \
+		current="$$($(BENDER_INSTALL_DIR)/bender --version)"; \
+		if [ "$$(printf '%s\n' "$${req}" "$${current}" | sort -V | head -n1)" != "$${req}" ]; then \
+			rm -rf $(BENDER_INSTALL_DIR); \
+		fi \
+	fi
+	@$(MAKE) -C $(ROOT_DIR) $(BENDER_INSTALL_DIR)/bender
+
+$(BENDER_INSTALL_DIR)/bender:
+	mkdir -p $(BENDER_INSTALL_DIR) && cd $(BENDER_INSTALL_DIR) && \
+	curl --proto '=https' --tlsv1.2 https://pulp-platform.github.io/bender/init -sSf | sh -s -- $(BENDER_VERSION)
+
+
+###############
+# Toolchain   #
+###############
+
+toolchain: download tc-llvm tc-riscv-gcc
+
+.PHONY: download
+download: ${TOOLCHAIN_DIR}/riscv-gnu-toolchain ${TOOLCHAIN_DIR}/llvm-project ${TOOLCHAIN_DIR}/riscv-opcodes ${TOOLCHAIN_DIR}/riscv-isa-sim ${TOOLCHAIN_DIR}/dtc
+
+
+${TOOLCHAIN_DIR}/riscv-gnu-toolchain: ${TOOLCHAIN_DIR}/riscv-gnu-toolchain.version
+	mkdir -p ${TOOLCHAIN_DIR}
+	cd ${TOOLCHAIN_DIR} && git clone https://github.com/pulp-platform/pulp-riscv-gnu-toolchain.git riscv-gnu-toolchain
+	cd ${TOOLCHAIN_DIR}/riscv-gnu-toolchain && \
+	git checkout `cat ../riscv-gnu-toolchain.version` && \
+	git submodule update --init --recursive --jobs=8 .
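+
+# Note: each ${TOOLCHAIN_DIR}/<repo> download rule clones the repository and
+# then pins it to the commit listed in the sibling <repo>.version file. To
+# re-pin a tool, update its version file and re-run the matching build target,
+# e.g. (illustrative commit hash only):
+#   echo 1234abcd > ${TOOLCHAIN_DIR}/llvm-project.version && make tc-llvm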
+
+${TOOLCHAIN_DIR}/llvm-project: ${TOOLCHAIN_DIR}/llvm-project.version
+	mkdir -p ${TOOLCHAIN_DIR}
+	cd ${TOOLCHAIN_DIR} && git clone https://github.com/mp-17/llvm-project.git
+	cd ${TOOLCHAIN_DIR}/llvm-project && \
+	git checkout `cat ../llvm-project.version` && \
+	git submodule update --init --recursive --jobs=8 .
+
+${TOOLCHAIN_DIR}/riscv-opcodes: ${TOOLCHAIN_DIR}/riscv-opcodes.version
+	mkdir -p ${TOOLCHAIN_DIR}
+	cd ${TOOLCHAIN_DIR} && git clone https://github.com/mp-17/riscv-opcodes.git
+	cd ${TOOLCHAIN_DIR}/riscv-opcodes && \
+	git checkout `cat ../riscv-opcodes.version` && \
+	git submodule update --init --recursive --jobs=8 .
+
+${TOOLCHAIN_DIR}/riscv-isa-sim: ${TOOLCHAIN_DIR}/riscv-isa-sim.version
+	mkdir -p ${TOOLCHAIN_DIR}
+	cd ${TOOLCHAIN_DIR} && git clone https://github.com/riscv-software-src/riscv-isa-sim.git
+	cd ${TOOLCHAIN_DIR}/riscv-isa-sim && \
+	git checkout `cat ../riscv-isa-sim.version` && \
+	git submodule update --init --recursive --jobs=8 .
+
+${TOOLCHAIN_DIR}/dtc:
+	mkdir -p ${TOOLCHAIN_DIR}/dtc
+	cd ${TOOLCHAIN_DIR}/dtc && wget -c https://git.kernel.org/pub/scm/utils/dtc/dtc.git/snapshot/dtc-1.7.0.tar.gz
+	cd ${TOOLCHAIN_DIR}/dtc && tar xf dtc-1.7.0.tar.gz
+
+tc-riscv-gcc: ${TOOLCHAIN_DIR}/riscv-gnu-toolchain
+	mkdir -p $(GCC_INSTALL_DIR)
+	cd ${TOOLCHAIN_DIR}/riscv-gnu-toolchain && rm -rf build && mkdir -p build && cd build && \
+	../configure --prefix=$(GCC_INSTALL_DIR) --with-arch=rv32imafd --with-abi=ilp32d --with-cmodel=medlow --enable-multilib && \
+	$(MAKE) MAKEINFO=true -j4
+
+tc-llvm: ${TOOLCHAIN_DIR}/llvm-project
+	mkdir -p $(LLVM_INSTALL_DIR)
+	cd ${TOOLCHAIN_DIR}/llvm-project && mkdir -p build && cd build && \
+	$(CMAKE) \
+		-DCMAKE_INSTALL_PREFIX=$(LLVM_INSTALL_DIR) \
+		-DCMAKE_CXX_COMPILER=g++-8.2.0 \
+		-DCMAKE_C_COMPILER=gcc-8.2.0 \
+		-DLLVM_OPTIMIZED_TABLEGEN=True \
+		-DLLVM_ENABLE_PROJECTS="clang;lld" \
+		-DLLVM_TARGETS_TO_BUILD="RISCV" \
+		-DLLVM_DEFAULT_TARGET_TRIPLE=riscv32-unknown-elf \
+		-DLLVM_ENABLE_LLD=False \
+		-DLLVM_APPEND_VC_REV=ON \
+		-DCMAKE_BUILD_TYPE=Release \
+		../llvm && \
+	make -j8 all && \
+	make install
+
+tc-riscv-isa-sim: ${TOOLCHAIN_DIR}/riscv-isa-sim ${TOOLCHAIN_DIR}/dtc
+	mkdir -p $(ISA_SIM_INSTALL_DIR)
+	cd ${TOOLCHAIN_DIR}/dtc/dtc-1.7.0 && make install PREFIX=$(ISA_SIM_INSTALL_DIR)
+	cd ${TOOLCHAIN_DIR}/riscv-isa-sim && rm -rf build && mkdir -p build && cd build && \
+	PATH=$(ISA_SIM_INSTALL_DIR)/bin:$(PATH) ../configure --prefix=$(ISA_SIM_INSTALL_DIR) && \
+	$(MAKE) MAKEINFO=true -j4 install
+
+
+#############
+# Opcodes   #
+#############
+
+.PHONY: update_opcodes
+update_opcodes: clean-opcodes ${TOOLCHAIN_DIR}/riscv-opcodes ${TOOLCHAIN_DIR}/riscv-opcodes/encoding.h ${SPATZ_DIR}/hw/ip/snitch/src/riscv_instr.sv
+
+clean-opcodes:
+	rm -rf ${TOOLCHAIN_DIR}/riscv-opcodes
+
+${SPATZ_DIR}/hw/ip/snitch/src/riscv_instr.sv: ${TOOLCHAIN_DIR}/riscv-opcodes
+	MY_OPCODES=$(OPCODES) make -C ${TOOLCHAIN_DIR}/riscv-opcodes inst.sverilog
+	mv ${TOOLCHAIN_DIR}/riscv-opcodes/inst.sverilog $@
+
+${TOOLCHAIN_DIR}/riscv-opcodes/encoding.h:
+	MY_OPCODES=$(OPCODES) make -C ${TOOLCHAIN_DIR}/riscv-opcodes all
+	cp ${TOOLCHAIN_DIR}/riscv-opcodes/encoding_out.h $@
+
+
+#################
+# Prerequisites #
+#################
+
+# Initialize the submodules and set up the toolchain for Spatz
+init:
+	git submodule update --init --recursive --jobs=8
+
+quick-tool:
+	ln -sf /usr/scratch2/calanda/diyou/toolchain/cachepool/install $(CACHEPOOL_DIR)/install
+
+# Build bootrom and spatz
+.PHONY: generate
+generate: update_opcodes
+	echo $(CFG_DIR)/$(CFG)
+	make -BC $(SPZ_CLS_DIR) generate bootrom SPATZ_CLUSTER_CFG=$(CFG_DIR)/$(CFG) BENDER=$(BENDER)
+
+.PHONY: cache-init
+cache-init:
+	cd ${CACHE_PATH} && source sourceme.sh
+
+######
+# SW #
+######
+
+## Delete sw/build
+clean.sw:
+	rm -rf ${SOFTWARE_DIR}/build
+
+## Build SW into sw/build with the LLVM toolchain
+.PHONY: sw
+sw: clean.sw
+	echo ${SOFTWARE_DIR}
+	mkdir -p ${SOFTWARE_DIR}/build
+	cd ${SOFTWARE_DIR}/build && ${CMAKE} \
+		-DUSE_CACHE=${USE_CACHE} -DMEAS_1ITER=${USE_1ITER} -DPRINT_CHECK=${USE_PRINT} \
+		-DENABLE_CACHEPOOL_TESTS=${ENABLE_CACHEPOOL_TESTS} -DCACHEPOOL_DIR=$(CACHEPOOL_DIR) \
+		-DRUNTIME_DIR=${SOFTWARE_DIR} -DSPATZ_SW_DIR=$(SPATZ_SW_DIR) \
+		-DLLVM_PATH=${LLVM_INSTALL_DIR} -DGCC_PATH=${GCC_INSTALL_DIR} -DPYTHON=${PYTHON} -DBUILD_TESTS=ON .. && make
+
+############
+# Modelsim #
+############
+# QuestaSim
+VSIM = questa-2021.3-kgf vsim
+VLOG = questa-2021.3-kgf vlog
+VSIM_HOME = /usr/pack/questa-2021.3-kgf/questasim
+
+# fesvr is installed here
+FESVR ?= ${SIM_DIR}/work
+FESVR_VERSION ?= c663ea20a53f4316db8cb4d591b1c8e437f4a0c4
+
+VSIM_FLAGS += -t 1ps
+VSIM_FLAGS += -voptargs=+acc
+VSIM_FLAGS += -suppress vsim-3999
+VSIM_FLAGS += -do "log -r /*; source ${SIM_DIR}/scripts/vsim_wave.tcl; run -a"
+
+VLOG_FLAGS += -svinputport=compat
+VLOG_FLAGS += -override_timescale 1ns/1ps
+VLOG_FLAGS += -suppress 2583
+VLOG_FLAGS += -suppress 13314
+VLOG_FLAGS += -64
+
+USE_CACHE ?= 1
+USE_PRINT ?= 1
+USE_1ITER ?= 0
+ENABLE_CACHEPOOL_TESTS ?= 1
+
+VSIM_BENDER += -t test -t rtl -t simulation -t spatz -t spatz_test -t snitch_test -t cachepool
+
+define QUESTASIM
+	${VSIM} -c -do "source $<; quit" | tee $(dir $<)vsim.log
+	@! grep -P "Errors: [1-9][0-9]*," $(dir $<)vsim.log
+	@mkdir -p bin
+	@echo "#!/bin/bash" > $@
+	@echo 'echo `realpath $$1` > ${SIMBIN_DIR}/logs/.rtlbinary' >> $@
+	@echo '${VSIM} +permissive ${VSIM_FLAGS} -work ${WORK_DIR} -c \
+				-ldflags "-Wl,-rpath,${FESVR}/lib -L${FESVR}/lib -lfesvr_vsim -lutil" \
+				$1 +permissive-off ++$$1' >> $@
+	@chmod +x $@
+	@echo "#!/bin/bash" > $@.gui
+	@echo 'echo `realpath $$1` > ${SIMBIN_DIR}/logs/.rtlbinary' >> $@.gui
+	@echo '${VSIM} +permissive ${VSIM_FLAGS} -work ${WORK_DIR} \
+				-ldflags "-Wl,-rpath,${FESVR}/lib -L${FESVR}/lib -lfesvr_vsim -lutil" \
+				$1 +permissive-off ++$$1' >> $@.gui
+	@chmod +x $@.gui
+endef
+
+${WORK_DIR}/${FESVR_VERSION}_unzip:
+	mkdir -p $(dir $@)
+	wget -O $(dir $@)${FESVR_VERSION} https://github.com/riscv/riscv-isa-sim/tarball/${FESVR_VERSION}
+	tar xfm $(dir $@)${FESVR_VERSION} --strip-components=1 -C $(dir $@)
+	touch $@
+
+${WORK_DIR}/lib/libfesvr_vsim.a: ${WORK_DIR}/${FESVR_VERSION}_unzip
+	cd $(dir $<) && PATH=${ISA_SIM_INSTALL_DIR}/bin:${PATH} CC=${VSIM_HOME}/gcc-7.4.0-linux_x86_64/bin/gcc CXX=${VSIM_HOME}/gcc-7.4.0-linux_x86_64/bin/g++ ./configure --prefix `pwd`
+	make -C $(dir $<) install-config-hdrs install-hdrs libfesvr.a
+	mkdir -p $(dir $@)
+	cp $(dir $<)libfesvr.a $@
+
+${WORK_DIR}/compile.vsim.tcl: ${SNLIB_DIR}/rtl_lib.cc ${SNLIB_DIR}/common_lib.cc ${BOOTLIB_DIR}/bootdata.cc ${BOOTLIB_DIR}/bootrom.bin
+	vlib $(dir $@)
+	${BENDER} script vsim ${VSIM_BENDER} ${DEFS} --vlog-arg="${VLOG_FLAGS} -work $(dir $@)" > $@
+	echo '${VLOG} -work $(dir $@) ${SNLIB_DIR}/rtl_lib.cc ${SNLIB_DIR}/common_lib.cc ${BOOTLIB_DIR}/bootdata.cc -ccflags "-std=c++17 -I${BOOTLIB_DIR} -I${WORK_DIR}/include -I${SNLIB_DIR}"' >> $@
+	echo '${VLOG} -work $(dir $@) ${BOOTLIB_DIR}/uartdpi/uartdpi.c -ccflags "-I${BOOTLIB_DIR}/uartdpi"' >> $@
+	echo 'return 0' >> $@
+
+${SIMBIN_DIR}/cachepool_cluster.vsim: ${WORK_DIR}/compile.vsim.tcl ${WORK_DIR}/lib/libfesvr_vsim.a
+	mkdir -p ${SIMBIN_DIR}/logs
+	$(call QUESTASIM,tb_bin)
+
+clean.vsim:
+	rm -rf ${WORK_DIR}/compile.vsim.tcl ${SIMBIN_DIR}/cachepool_cluster.vsim ${SIMBIN_DIR}/cachepool_cluster.vsim.gui ${SIM_DIR}/work-vsim ${WORK_DIR} vsim.wlf vish_stacktrace.vstf transcript
+
+.PHONY: vsim
+vsim: clean.sw ${SIMBIN_DIR}/cachepool_cluster.vsim
+	echo ${SOFTWARE_DIR}
+	mkdir -p ${SOFTWARE_DIR}/build
+	cd ${SOFTWARE_DIR}/build && ${CMAKE} \
+		-DUSE_CACHE=${USE_CACHE} -DMEAS_1ITER=${USE_1ITER} -DPRINT_CHECK=${USE_PRINT} \
+		-DENABLE_CACHEPOOL_TESTS=${ENABLE_CACHEPOOL_TESTS} -DCACHEPOOL_DIR=$(CACHEPOOL_DIR) \
+		-DRUNTIME_DIR=${SOFTWARE_DIR} -DSPATZ_SW_DIR=$(SPATZ_SW_DIR) \
+		-DLLVM_PATH=${LLVM_INSTALL_DIR} -DGCC_PATH=${GCC_INSTALL_DIR} -DPYTHON=${PYTHON} \
+		-DSNITCH_SIMULATOR=${SIMBIN_DIR}/cachepool_cluster.vsim -DBUILD_TESTS=ON .. && make
diff --git a/README.md b/README.md
index f5cc7e9..c13f38b 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,56 @@
-# ManyRVData
+# CachePool
+
+This repository is still under construction...
+It will be transferred to GitHub once we have a working demo.
+
+## Get Started
+
+First, initialize the submodules and generate the needed hardware with:
+
+```bash
+make init
+make generate
+```
+
+The LLVM and GCC toolchains are required for this project. You can build them with:
+
+```bash
+make toolchain
+```
+
+Alternatively, you can link the pre-built toolchain within the ETH domain:
+
+```bash
+make quick-tool
+```
+
+
+You can build only the software with:
+
+```bash
+make sw
+```
+
+Or build the software and hardware together (only QuestaSim is supported for now):
+
+```bash
+make vsim
+```
+
+The QuestaSim simulation can be run with:
+
+```bash
+./sim/bin/cachepool_cluster.vsim.gui ./software/build/TESTNAME
+```
+
+## Change configurations
+
+Currently, the runtime support is still under construction, and changing the configuration requires manual modifications to multiple files.
+`cfg/cachepool.hjson` provides the configuration used to generate the vector-core package.
+If you change a system-level variable, e.g. the cache size, you must also update `hardware/src/cachepool_pkg.sv`, which defines the system-level elaboration parameters.
+In some rare cases, you may also need to change `hardware/tb/cachepool_cluster_wrapper.sv` for the cached-region size and support (this will be moved into `cachepool_pkg.sv` in the future).
+
+After modifying the files, re-generate all the auto-generated files with:
+```bash
+make generate -B
+```
diff --git a/cfg/cachepool.hjson b/cfg/cachepool.hjson
new file mode 100644
index 0000000..4981485
--- /dev/null
+++ b/cfg/cachepool.hjson
@@ -0,0 +1,122 @@
+// Copyright 2025 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Cluster configuration for a simple system.
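+
+// Address map at a glance (derived from the fields below; the values in this
+// file are decimal, with hex equivalents in the per-field comments):
+//   boot ROM         0x0000_1000
+//   cluster base     0x5100_0000
+//   L2               0x5180_0000 (8 MiB)
+//   uncached region  0x5200_0000 (8 MiB)
+//   DRAM             0x8000_0000 (2 GiB)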
+{
+  cluster: {
+    mempool: 0,
+    boot_addr: 4096, // 0x1000
+    cluster_base_addr: 1358954496, // 0x51000000
+    cluster_base_offset: 0, // 0x0
+    cluster_base_hartid: 16,
+    addr_width: 32,
+    data_width: 64,
+    id_width_in: 6,
+    id_width_out: 2,
+    user_width: 10,
+    axi_cdc_enable: false,
+    sw_rst_enable: true,
+    axi_isolate_enable: false,
+    tcdm: {
+      size: 256,
+      banks: 16,
+    },
+    cluster_periph_size: 64, // kB
+    dma_data_width: 256,
+    dma_axi_req_fifo_depth: 3,
+    dma_req_fifo_depth: 3,
+    // Spatz parameters
+    vlen: 512,
+    n_fpu: 4,
+    n_ipu: 4,
+    spatz_fpu: true,
+    norvd: false,
+    // Timing parameters
+    timing: {
+      lat_comp_fp32: 1,
+      lat_comp_fp64: 2,
+      lat_comp_fp16: 1,
+      lat_comp_fp16_alt: 1,
+      lat_comp_fp8: 0,
+      lat_comp_fp8_alt: 0,
+      lat_noncomp: 1,
+      lat_conv: 2,
+      lat_sdotp: 4,
+      fpu_pipe_config: "BEFORE",
+      xbar_latency: "CUT_ALL_PORTS",
+
+      register_tcdm_cuts: true,
+      register_core_req: true,
+      register_core_rsp: true,
+      register_offload_rsp: true
+    },
+    cores: [
+      { $ref: "#/dma_core_template" },
+      { $ref: "#/compute_core_template" },
+      { $ref: "#/compute_core_template" },
+      { $ref: "#/compute_core_template" },
+    ],
+    icache: {
+      size: 4, // total instruction cache size in kByte
+      sets: 2, // number of ways
+      cacheline: 128 // cacheline width in bits
+    }
+  }
+
+  dram: {
+    // 0x8000_0000
+    address: 2147483648,
+    // 0x8000_0000
+    length: 2147483648
+  },
+  l2: {
+    // 0x5180_0000
+    address: 1367343104,
+    // 0x80_0000 (ends at 0x5200_0000)
+    length: 8388608
+  },
+  uncached_region: {
+    // 0x5200_0000
+    address: 1375731712,
+    // 0x80_0000 (ends at 0x5280_0000)
+    length: 8388608
+  },
+  peripherals: {
+  },
+
+  // Templates.
+
+  compute_core_template: {
+    isa: "rv32imafd",
+    xf16: true,
+    xf8: true,
+    xfdotp: true,
+    xdma: false,
+    num_int_outstanding_loads: 16,
+    num_int_outstanding_mem: 16,
+    num_spatz_outstanding_loads: 16,
+    num_dtlb_entries: 1,
+    num_itlb_entries: 1
+  },
+  dma_core_template: {
+    isa: "rv32imafd",
+    xdma: true,
+    xf16: true,
+    xf8: true,
+    xfdotp: true,
+    num_int_outstanding_loads: 16,
+    num_int_outstanding_mem: 16,
+    num_spatz_outstanding_loads: 16,
+    num_dtlb_entries: 1,
+    num_itlb_entries: 1
+  }
+}
diff --git a/hardware/deps/spatz b/hardware/deps/spatz
new file mode 160000
index 0000000..b45d7ce
--- /dev/null
+++ b/hardware/deps/spatz
@@ -0,0 +1 @@
+Subproject commit b45d7cec991e2612a5566f90cf0c888594a1ab8c
diff --git a/hardware/src/cachepool_cluster.sv b/hardware/src/cachepool_cluster.sv
new file mode 100644
index 0000000..d62f5fa
--- /dev/null
+++ b/hardware/src/cachepool_cluster.sv
@@ -0,0 +1,424 @@
+// Copyright 2025 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+// Author: Diyou Shen
+
+`include "axi/assign.svh"
+`include "axi/typedef.svh"
+`include "common_cells/assertions.svh"
+`include "common_cells/registers.svh"
+`include "mem_interface/assign.svh"
+`include "mem_interface/typedef.svh"
+`include "register_interface/assign.svh"
+`include "register_interface/typedef.svh"
+`include "reqrsp_interface/assign.svh"
+`include "reqrsp_interface/typedef.svh"
+`include "snitch_vm/typedef.svh"
+`include "tcdm_interface/assign.svh"
+`include "tcdm_interface/typedef.svh"
+
+/// A multi-tile cluster implementation for CachePool.
+module cachepool_cluster
+  import cachepool_pkg::*;
+  import spatz_pkg::*;
+  import fpnew_pkg::fpu_implementation_t;
+  import snitch_pma_pkg::snitch_pma_t;
+  #(
+    /// Width of physical address.
+    parameter int unsigned AxiAddrWidth = 48,
+    /// Width of AXI port.
+    parameter int unsigned AxiDataWidth = 512,
+    /// AXI: id width in.
+    parameter int unsigned AxiIdWidthIn = 2,
+    /// AXI: id width out.
+    parameter int unsigned AxiIdWidthOut = 2,
+    /// AXI: user width.
+    parameter int unsigned AxiUserWidth = 1,
+    /// Address from which to fetch the first instructions.
+    parameter logic [31:0] BootAddr = 32'h0,
+    /// Address to indicate start of L2.
+    parameter logic [AxiAddrWidth-1:0] L2Addr = 48'h0,
+    parameter logic [AxiAddrWidth-1:0] L2Size = 48'h0,
+    /// The total amount of cores.
+    parameter int unsigned NrCores = 8,
+    /// Data/TCDM memory depth per cut (in words).
+    parameter int unsigned TCDMDepth = 1024,
+    /// Cluster peripheral address region size (in kB).
+    parameter int unsigned ClusterPeriphSize = 64,
+    /// Number of TCDM Banks.
+    parameter int unsigned NrBanks = 2 * NrCores,
+    /// Size of DMA AXI buffer.
+    parameter int unsigned DMAAxiReqFifoDepth = 3,
+    /// Size of DMA request fifo.
+    parameter int unsigned DMAReqFifoDepth = 3,
+    /// Width of a single icache line.
+    parameter int unsigned ICacheLineWidth = 0,
+    /// Number of icache lines per set.
+    parameter int unsigned ICacheLineCount = 0,
+    /// Number of icache sets.
+    parameter int unsigned ICacheSets = 0,
+    // PMA Configuration
+    parameter snitch_pma_t SnitchPMACfg = '{default: 0},
+    /// # Core-global parameters
+    /// FPU configuration.
+    parameter fpu_implementation_t FPUImplementation [NrCores] = '{default: fpu_implementation_t'(0)},
+    /// Spatz FPU/IPU Configuration
+    parameter int unsigned NumSpatzFPUs = 4,
+    parameter int unsigned NumSpatzIPUs = 1,
+    /// Per-core enabling of the custom `Xdma` ISA extensions.
+    parameter bit [NrCores-1:0] Xdma = '{default: '0},
+    /// # Per-core parameters
+    /// Per-core integer outstanding loads
+    parameter int unsigned NumIntOutstandingLoads [NrCores] = '{default: '0},
+    /// Per-core integer outstanding memory operations (load and stores)
+    parameter int unsigned NumIntOutstandingMem [NrCores] = '{default: '0},
+    /// Per-core Spatz outstanding loads
+    parameter int unsigned NumSpatzOutstandingLoads [NrCores] = '{default: '0},
+    /// ## Timing Tuning Parameters
+    /// Insert Pipeline registers into off-loading path (response)
+    parameter bit RegisterOffloadRsp = 1'b0,
+    /// Insert Pipeline registers into data memory path (request)
+    parameter bit RegisterCoreReq = 1'b0,
+    /// Insert Pipeline registers into data memory path (response)
+    parameter bit RegisterCoreRsp = 1'b0,
+    /// Insert Pipeline registers after each memory cut
+    parameter bit RegisterTCDMCuts = 1'b0,
+    /// Decouple external AXI plug
+    parameter bit RegisterExt = 1'b0,
+    parameter axi_pkg::xbar_latency_e XbarLatency = axi_pkg::CUT_ALL_PORTS,
+    /// Outstanding transactions on the AXI network
+    parameter int unsigned MaxMstTrans = 4,
+    parameter int unsigned MaxSlvTrans = 4,
+    /// # Interface
+    /// AXI Ports
+    parameter type axi_in_req_t = logic,
+    parameter type axi_in_resp_t = logic,
+    parameter type axi_out_req_t = logic,
+    parameter type axi_out_resp_t = logic,
+    /// SRAM configuration
+    parameter type impl_in_t = logic,
+    // Memory latency parameter. Most of the memories have a read latency of 1.
+    // In case you have memory macros which are pipelined, adjust this value
+    // here. This only applies to the TCDM; the instruction cache macros will
+    // break! If the `RegisterTCDMCuts` feature is used, it adds one additional
+    // cycle of latency, which is taken into account here.
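+    // For example (illustration only): with single-cycle SRAM macros and
+    // RegisterTCDMCuts = 1, the effective TCDM latency is 1 + 1 = 2 cycles.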
+    parameter int unsigned MemoryMacroLatency = 1 + RegisterTCDMCuts,
+    /// # SRAM Configuration rules needed: L1D Tag + L1D Data + L1D FIFO + L1I Tag + L1I Data
+    /*** ATTENTION: `NrSramCfg` should be changed if `L1NumDataBank` or `L1NumTagBank` is changed ***/
+    parameter int unsigned NrSramCfg = 1
+  ) (
+    /// System clock.
+    input  logic clk_i,
+    /// Asynchronous active high reset. This signal is assumed to be _async_.
+    input  logic rst_ni,
+    /// Per-core debug request signal. Asserting this signal puts the
+    /// corresponding core into debug mode. This signal is assumed to be _async_.
+    input  logic [NrCores-1:0] debug_req_i,
+    /// Machine external interrupt pending. Usually those interrupts come from a
+    /// platform-level interrupt controller. This signal is assumed to be _async_.
+    input  logic [NrCores-1:0] meip_i,
+    /// Machine timer interrupt pending. Usually those interrupts come from a
+    /// core-local interrupt controller such as a timer/RTC. This signal is
+    /// assumed to be _async_.
+    input  logic [NrCores-1:0] mtip_i,
+    /// Core software interrupt pending. Usually those interrupts come from
+    /// another core to facilitate inter-processor-interrupts. This signal is
+    /// assumed to be _async_.
+    input  logic [NrCores-1:0] msip_i,
+    /// First hartid of the cluster. Cores of a cluster are monotonically
+    /// increasing without a gap, i.e., a cluster with 8 cores and a
+    /// `hart_base_id_i` of 5 gets the hartids 5 - 12.
+    input  logic [9:0] hart_base_id_i,
+    /// Base address of cluster. TCDM and cluster peripheral location are derived from
+    /// it. This signal is pseudo-static.
+    input  logic [AxiAddrWidth-1:0] cluster_base_addr_i,
+    /// Per-cluster probe on the cluster status. Can be written by the cores to indicate
+    /// to the overall system that the cluster is executing something.
+    output logic [NumTiles-1:0] cluster_probe_o,
+    /// AXI Core cluster in-port.
+    input  axi_in_req_t [NumTiles-1:0] axi_in_req_i,
+    output axi_in_resp_t [NumTiles-1:0] axi_in_resp_o,
+    /// AXI Core cluster out-port to core.
+    output axi_out_req_t axi_out_req_o,
+    input  axi_out_resp_t axi_out_resp_i,
+    /// AXI Core cluster out-port to L2 Mem.
+    output axi_out_req_t axi_out_l2_req_o,
+    input  axi_out_resp_t axi_out_l2_resp_i,
+    /// SRAM Configuration: L1D Data + L1D Tag + L1D FIFO + L1I Data + L1I Tag
+    input  impl_in_t [NrSramCfg-1:0] impl_i,
+    /// Indicates a program execution error.
+    output logic error_o
+  );
+  // ---------
+  // Imports
+  // ---------
+  import snitch_pkg::*;
+  import snitch_icache_pkg::icache_events_t;
+
+  // ---------
+  // Constants
+  // ---------
+  /// Minimum width to hold the core number.
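+  // e.g., NrCores = 4 gives CoreIDWidth = 2 (cf_math_pkg::idx_width is a
+  // ceil(log2) helper that saturates at a minimum width of 1).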
+  localparam int unsigned CoreIDWidth = cf_math_pkg::idx_width(NrCores);
+
+  // Enlarge the address width for Spatz due to cache
+  localparam int unsigned TCDMAddrWidth = 32;
+
+  // Core Request, SoC Request
+  localparam int unsigned NrNarrowMasters = 2;
+
+  localparam int unsigned WideIdWidthOut = AxiIdWidthOut;
+  localparam int unsigned WideIdWidthIn  = WideIdWidthOut - $clog2(NumClusterAxiMst);
+
+  // Cache XBar configuration struct
+  localparam axi_pkg::xbar_cfg_t CacheXbarCfg = '{
+    NoSlvPorts        : NumClusterAxiMst*NumTiles,
+    NoMstPorts        : NumClusterAxiSlv,
+    MaxMstTrans       : MaxMstTrans,
+    MaxSlvTrans       : MaxSlvTrans,
+    FallThrough       : 1'b0,
+    LatencyMode       : XbarLatency,
+    AxiIdWidthSlvPorts: WideIdWidthIn,
+    AxiIdUsedSlvPorts : WideIdWidthIn,
+    UniqueIds         : 1'b0,
+    AxiAddrWidth      : AxiAddrWidth,
+    AxiDataWidth      : AxiDataWidth,
+    NoAddrRules       : NumClusterAxiSlv - 1,
+    default           : '0
+  };
+
+  // --------
+  // Typedefs
+  // --------
+  typedef logic [AxiAddrWidth-1:0] addr_t;
+  typedef logic [AxiDataWidth-1:0] data_cache_t;
+  typedef logic [AxiDataWidth/8-1:0] strb_cache_t;
+  typedef logic [WideIdWidthIn-1:0] id_cache_mst_t;
+  typedef logic [WideIdWidthOut-1:0] id_cache_slv_t;
+  typedef logic [AxiUserWidth-1:0] user_cache_t;
+
+  `AXI_TYPEDEF_ALL(axi_mst_cache, addr_t, id_cache_mst_t, data_cache_t, strb_cache_t, user_cache_t)
+  `AXI_TYPEDEF_ALL(axi_slv_cache, addr_t, id_cache_slv_t, data_cache_t, strb_cache_t, user_cache_t)
+
+  `REG_BUS_TYPEDEF_ALL(reg_cache, addr_t, data_cache_t, strb_cache_t)
+
+  typedef struct packed {
+    int unsigned idx;
+    addr_t start_addr;
+    addr_t end_addr;
+  } xbar_rule_t;
+
+  `SNITCH_VM_TYPEDEF(AxiAddrWidth)
+
+  // -----------
+  // Assignments
+  // -----------
+  // Calculate start and end address of the L2 region from `L2Addr` and `L2Size`.
+
+  addr_t cluster_l2_start_address, cluster_l2_end_address;
+  assign cluster_l2_start_address = L2Addr;
+  assign cluster_l2_end_address   = L2Addr + L2Size;
+
+  // ----------------
+  // Wire Definitions
+  // ----------------
+  // 1. AXI
+  axi_mst_cache_req_t  [NumTiles*NumL1CacheCtrl-1 :0]   axi_cache_req;
+  axi_mst_cache_resp_t [NumTiles*NumL1CacheCtrl-1 :0]   axi_cache_rsp;
+  axi_mst_cache_req_t  [NumTiles*NumTileWideAxi-1 :0]   axi_tile_req;
+  axi_mst_cache_resp_t [NumTiles*NumTileWideAxi-1 :0]   axi_tile_rsp;
+  axi_slv_cache_req_t  [NumTiles*NumClusterAxiSlv-1 :0] wide_axi_slv_req;
+  axi_slv_cache_resp_t [NumTiles*NumClusterAxiSlv-1 :0] wide_axi_slv_rsp;
+
+  // 2.
BootROM + reg_cache_req_t bootrom_reg_req; + reg_cache_rsp_t bootrom_reg_rsp; + + // --------------- + // CachePool Tile + // --------------- + + for (genvar t = 0; t < NumTiles; t ++) begin : gen_tiles + cachepool_tile #( + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( AxiDataWidth ), + .AxiIdWidthIn ( AxiIdWidthIn ), + .AxiIdWidthOut ( WideIdWidthIn ), + .AxiUserWidth ( AxiUserWidth ), + .BootAddr ( BootAddr ), + .L2Addr ( L2Addr ), + .L2Size ( L2Size ), + .ClusterPeriphSize ( ClusterPeriphSize ), + .NrCores ( NrCores ), + .TCDMDepth ( TCDMDepth ), + .NrBanks ( NrBanks ), + .ICacheLineWidth ( ICacheLineWidth ), + .ICacheLineCount ( ICacheLineCount ), + .ICacheSets ( ICacheSets ), + .FPUImplementation ( FPUImplementation ), + .NumSpatzFPUs ( NumSpatzFPUs ), + .NumSpatzIPUs ( NumSpatzIPUs ), + .SnitchPMACfg ( SnitchPMACfg ), + .NumIntOutstandingLoads ( NumIntOutstandingLoads ), + .NumIntOutstandingMem ( NumIntOutstandingMem ), + .NumSpatzOutstandingLoads ( NumSpatzOutstandingLoads ), + .axi_in_req_t ( axi_in_req_t ), + .axi_in_resp_t ( axi_in_resp_t ), + .axi_out_req_t ( axi_mst_cache_req_t ), + .axi_out_resp_t ( axi_mst_cache_resp_t ), + .Xdma ( Xdma ), + .DMAAxiReqFifoDepth ( DMAAxiReqFifoDepth ), + .DMAReqFifoDepth ( DMAReqFifoDepth ), + .RegisterOffloadRsp ( RegisterOffloadRsp ), + .RegisterCoreReq ( RegisterCoreReq ), + .RegisterCoreRsp ( RegisterCoreRsp ), + .RegisterTCDMCuts ( RegisterTCDMCuts ), + .RegisterExt ( RegisterExt ), + .XbarLatency ( XbarLatency ), + .MaxMstTrans ( MaxMstTrans ), + .MaxSlvTrans ( MaxSlvTrans ) + ) i_tile ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .impl_i ( impl_i ), + .error_o ( ), + .debug_req_i ( debug_req_i ), + .meip_i ( meip_i ), + .mtip_i ( mtip_i ), + .msip_i ( msip_i ), + .hart_base_id_i ( hart_base_id_i ), + .cluster_base_addr_i ( cluster_base_addr_i ), + .tile_probe_o ( cluster_probe_o[t] ), + .axi_in_req_i ( axi_in_req_i [t] ), + .axi_in_resp_o ( axi_in_resp_o[t] ), + // AXI Master Port + .axi_cache_req_o ( axi_cache_req[t*NumL1CacheCtrl+:NumL1CacheCtrl] ), + .axi_cache_rsp_i ( axi_cache_rsp[t*NumL1CacheCtrl+:NumL1CacheCtrl] ), + .axi_wide_req_o ( axi_tile_req [t*NumTileWideAxi+:NumTileWideAxi] ), + .axi_wide_rsp_i ( axi_tile_rsp [t*NumTileWideAxi+:NumTileWideAxi] ) + ); + end + logic [CacheXbarCfg.NoSlvPorts-1:0][$clog2(CacheXbarCfg.NoMstPorts)-1:0] cache_xbar_default_port; + xbar_rule_t [CacheXbarCfg.NoAddrRules-1:0] cache_xbar_rule; + + assign cache_xbar_default_port = '{default: ClusterL2}; + assign cache_xbar_rule = '{ + '{ + idx : ClusterL2, + start_addr: cluster_l2_start_address, + end_addr : cluster_l2_end_address + } + }; + + localparam bit [CacheXbarCfg.NoSlvPorts-1:0] CacheEnDefaultMstPort = '1; + + axi_xbar #( + .Cfg (CacheXbarCfg ), + .ATOPs (0 ), + .slv_aw_chan_t (axi_mst_cache_aw_chan_t), + .mst_aw_chan_t (axi_slv_cache_aw_chan_t), + .w_chan_t (axi_mst_cache_w_chan_t ), + .slv_b_chan_t (axi_mst_cache_b_chan_t ), + .mst_b_chan_t (axi_slv_cache_b_chan_t ), + .slv_ar_chan_t (axi_mst_cache_ar_chan_t), + .mst_ar_chan_t (axi_slv_cache_ar_chan_t), + .slv_r_chan_t (axi_mst_cache_r_chan_t ), + .mst_r_chan_t (axi_slv_cache_r_chan_t ), + .slv_req_t (axi_mst_cache_req_t ), + .slv_resp_t (axi_mst_cache_resp_t ), + .mst_req_t (axi_slv_cache_req_t ), + .mst_resp_t (axi_slv_cache_resp_t ), + .rule_t (xbar_rule_t ) + ) i_cluster_xbar ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .test_i (1'b0 ), + .slv_ports_req_i ({axi_cache_req, axi_tile_req[TileMem]} ), + .slv_ports_resp_o ({axi_cache_rsp, axi_tile_rsp[TileMem]} ), + .mst_ports_req_o 
(wide_axi_slv_req        ),
+    .mst_ports_resp_i      (wide_axi_slv_rsp        ),
+    .addr_map_i            (cache_xbar_rule         ),
+    .en_default_mst_port_i (CacheEnDefaultMstPort   ),
+    .default_mst_port_i    (cache_xbar_default_port )
+  );
+
+
+  // -------------
+  // DMA Subsystem
+  // -------------
+  // Optionally decouple the external wide AXI master port.
+  axi_cut #(
+    .Bypass     (!RegisterExt            ),
+    .aw_chan_t  (axi_slv_cache_aw_chan_t ),
+    .w_chan_t   (axi_slv_cache_w_chan_t  ),
+    .b_chan_t   (axi_slv_cache_b_chan_t  ),
+    .ar_chan_t  (axi_slv_cache_ar_chan_t ),
+    .r_chan_t   (axi_slv_cache_r_chan_t  ),
+    .axi_req_t  (axi_slv_cache_req_t     ),
+    .axi_resp_t (axi_slv_cache_resp_t    )
+  ) i_cut_ext_wide_out (
+    .clk_i      (clk_i                       ),
+    .rst_ni     (rst_ni                      ),
+    .slv_req_i  (wide_axi_slv_req[ClusterL3] ),
+    .slv_resp_o (wide_axi_slv_rsp[ClusterL3] ),
+    .mst_req_o  (axi_out_req_o               ),
+    .mst_resp_i (axi_out_resp_i              )
+  );
+
+  axi_cut #(
+    .Bypass     (!RegisterExt            ),
+    .aw_chan_t  (axi_slv_cache_aw_chan_t ),
+    .w_chan_t   (axi_slv_cache_w_chan_t  ),
+    .b_chan_t   (axi_slv_cache_b_chan_t  ),
+    .ar_chan_t  (axi_slv_cache_ar_chan_t ),
+    .r_chan_t   (axi_slv_cache_r_chan_t  ),
+    .axi_req_t  (axi_slv_cache_req_t     ),
+    .axi_resp_t (axi_slv_cache_resp_t    )
+  ) i_cut_ext_l2_wide_out (
+    .clk_i      (clk_i                       ),
+    .rst_ni     (rst_ni                      ),
+    .slv_req_i  (wide_axi_slv_req[ClusterL2] ),
+    .slv_resp_o (wide_axi_slv_rsp[ClusterL2] ),
+    .mst_req_o  (axi_out_l2_req_o            ),
+    .mst_resp_i (axi_out_l2_resp_i           )
+  );
+
+  // ---------
+  // Slaves
+  // ---------
+
+  // TODO: Add MUX for multi-Tile
+  // BootROM
+  axi_to_reg #(
+    .ADDR_WIDTH         (AxiAddrWidth         ),
+    .DATA_WIDTH         (AxiDataWidth         ),
+    .AXI_MAX_WRITE_TXNS (1                    ),
+    .AXI_MAX_READ_TXNS  (1                    ),
+    .DECOUPLE_W         (0                    ),
+    .ID_WIDTH           (WideIdWidthIn        ),
+    .USER_WIDTH         (AxiUserWidth         ),
+    .axi_req_t          (axi_mst_cache_req_t  ),
+    .axi_rsp_t          (axi_mst_cache_resp_t ),
+    .reg_req_t          (reg_cache_req_t      ),
+    .reg_rsp_t          (reg_cache_rsp_t      )
+  ) i_axi_to_reg_bootrom (
+    .clk_i      (clk_i                     ),
+    .rst_ni     (rst_ni                    ),
+    .testmode_i (1'b0                      ),
+    .axi_req_i  (axi_tile_req[TileBootROM] ),
+    .axi_rsp_o  (axi_tile_rsp[TileBootROM] ),
+    .reg_req_o  (bootrom_reg_req           ),
+    .reg_rsp_i  (bootrom_reg_rsp           )
+  );
+
+  bootrom i_bootrom (
+    .clk_i   (clk_i                         ),
+    .req_i   (bootrom_reg_req.valid         ),
+    .addr_i  (addr_t'(bootrom_reg_req.addr) ),
+    .rdata_o (bootrom_reg_rsp.rdata         )
+  );
+  `FF(bootrom_reg_rsp.ready, bootrom_reg_req.valid, 1'b0)
+  assign bootrom_reg_rsp.error = 1'b0;
+
+endmodule
diff --git a/hardware/src/cachepool_cluster_simple.sv b/hardware/src/cachepool_cluster_simple.sv
new file mode 100644
index 0000000..ff963d5
--- /dev/null
+++ b/hardware/src/cachepool_cluster_simple.sv
@@ -0,0 +1,1570 @@
+// Copyright 2025 ETH Zurich and University of Bologna.
+// Solderpad Hardware License, Version 0.51, see LICENSE for details.
+// SPDX-License-Identifier: SHL-0.51
+
+// Author: Diyou Shen
+
+`include "axi/assign.svh"
+`include "axi/typedef.svh"
+`include "common_cells/assertions.svh"
+`include "common_cells/registers.svh"
+`include "mem_interface/assign.svh"
+`include "mem_interface/typedef.svh"
+`include "register_interface/assign.svh"
+`include "register_interface/typedef.svh"
+`include "reqrsp_interface/assign.svh"
+`include "reqrsp_interface/typedef.svh"
+`include "snitch_vm/typedef.svh"
+`include "tcdm_interface/assign.svh"
+`include "tcdm_interface/typedef.svh"
+
+/// A single-tile cluster implementation for CachePool.
+module cachepool_cluster_simple
+  import cachepool_pkg::*;
+  import spatz_pkg::*;
+  import fpnew_pkg::fpu_implementation_t;
+  import snitch_pma_pkg::snitch_pma_t;
+  #(
+    /// Width of physical address.
+    parameter int unsigned AxiAddrWidth = 48,
+    /// Width of AXI port.
+    parameter int unsigned AxiDataWidth = 512,
+    /// AXI: id width in.
+    parameter int unsigned AxiIdWidthIn = 2,
+    /// AXI: id width out.
+    parameter int unsigned AxiIdWidthOut = 2,
+    /// AXI: user width.
+    parameter int unsigned AxiUserWidth = 1,
+    /// Address from which to fetch the first instructions.
+    parameter logic [31:0] BootAddr = 32'h0,
+    /// Address to indicate start of L2.
+    parameter logic [AxiAddrWidth-1:0] L2Addr = 48'h0,
+    parameter logic [AxiAddrWidth-1:0] L2Size = 48'h0,
+    /// The total amount of cores.
+    parameter int unsigned NrCores = 8,
+    /// Data/TCDM memory depth per cut (in words).
+    parameter int unsigned TCDMDepth = 1024,
+    /// Cluster peripheral address region size (in kB).
+    parameter int unsigned ClusterPeriphSize = 64,
+    /// Number of TCDM Banks.
+    parameter int unsigned NrBanks = 2 * NrCores,
+    /// Size of DMA AXI buffer.
+    parameter int unsigned DMAAxiReqFifoDepth = 3,
+    /// Size of DMA request fifo.
+    parameter int unsigned DMAReqFifoDepth = 3,
+    /// Width of a single icache line.
+    parameter int unsigned ICacheLineWidth = 0,
+    /// Number of icache lines per set.
+    parameter int unsigned ICacheLineCount = 0,
+    /// Number of icache sets.
+    parameter int unsigned ICacheSets = 0,
+    // PMA Configuration
+    parameter snitch_pma_t SnitchPMACfg = '{default: 0},
+    /// # Core-global parameters
+    /// FPU configuration.
+    parameter fpu_implementation_t FPUImplementation [NrCores] = '{default: fpu_implementation_t'(0)},
+    /// Spatz FPU/IPU Configuration
+    parameter int unsigned NumSpatzFPUs = 4,
+    parameter int unsigned NumSpatzIPUs = 1,
+    /// Per-core enabling of the custom `Xdma` ISA extensions.
+    parameter bit [NrCores-1:0] Xdma = '{default: '0},
+    /// # Per-core parameters
+    /// Per-core integer outstanding loads
+    parameter int unsigned NumIntOutstandingLoads [NrCores] = '{default: '0},
+    /// Per-core integer outstanding memory operations (load and stores)
+    parameter int unsigned NumIntOutstandingMem [NrCores] = '{default: '0},
+    /// Per-core Spatz outstanding loads
+    parameter int unsigned NumSpatzOutstandingLoads [NrCores] = '{default: '0},
+    /// ## Timing Tuning Parameters
+    /// Insert Pipeline registers into off-loading path (response)
+    parameter bit RegisterOffloadRsp = 1'b0,
+    /// Insert Pipeline registers into data memory path (request)
+    parameter bit RegisterCoreReq = 1'b0,
+    /// Insert Pipeline registers into data memory path (response)
+    parameter bit RegisterCoreRsp = 1'b0,
+    /// Insert Pipeline registers after each memory cut
+    parameter bit RegisterTCDMCuts = 1'b0,
+    /// Decouple external AXI plug
+    parameter bit RegisterExt = 1'b0,
+    parameter axi_pkg::xbar_latency_e XbarLatency = axi_pkg::CUT_ALL_PORTS,
+    /// Outstanding transactions on the AXI network
+    parameter int unsigned MaxMstTrans = 4,
+    parameter int unsigned MaxSlvTrans = 4,
+    /// # Interface
+    /// AXI Ports
+    parameter type axi_in_req_t = logic,
+    parameter type axi_in_resp_t = logic,
+    parameter type axi_out_req_t = logic,
+    parameter type axi_out_resp_t = logic,
+    /// SRAM configuration
+    parameter type impl_in_t = logic,
+    // Memory latency parameter. Most of the memories have a read latency of 1.
+    // In case you have memory macros which are pipelined, adjust this value
+    // here. This only applies to the TCDM; the instruction cache macros will
+    // break! If the `RegisterTCDMCuts` feature is used, it adds one additional
+    // cycle of latency, which is taken into account here.
+    parameter int unsigned MemoryMacroLatency = 1 + RegisterTCDMCuts,
+    /// # SRAM Configuration rules needed: L1D Tag + L1D Data + L1D FIFO + L1I Tag + L1I Data
+    /*** ATTENTION: `NrSramCfg` should be changed if `L1NumDataBank` or `L1NumTagBank` is changed ***/
+    parameter int unsigned NrSramCfg = 1
+  ) (
+    /// System clock.
+    input  logic clk_i,
+    /// Asynchronous active high reset. This signal is assumed to be _async_.
+    input  logic rst_ni,
+    /// Per-core debug request signal. Asserting this signal puts the
+    /// corresponding core into debug mode. This signal is assumed to be _async_.
+    input  logic [NrCores-1:0] debug_req_i,
+    /// Machine external interrupt pending. Usually those interrupts come from a
+    /// platform-level interrupt controller. This signal is assumed to be _async_.
+    input  logic [NrCores-1:0] meip_i,
+    /// Machine timer interrupt pending. Usually those interrupts come from a
+    /// core-local interrupt controller such as a timer/RTC. This signal is
+    /// assumed to be _async_.
+    input  logic [NrCores-1:0] mtip_i,
+    /// Core software interrupt pending. Usually those interrupts come from
+    /// another core to facilitate inter-processor-interrupts. This signal is
+    /// assumed to be _async_.
+    input  logic [NrCores-1:0] msip_i,
+    /// First hartid of the cluster. Cores of a cluster are monotonically
+    /// increasing without a gap, i.e., a cluster with 8 cores and a
+    /// `hart_base_id_i` of 5 gets the hartids 5 - 12.
+    input  logic [9:0] hart_base_id_i,
+    /// Base address of cluster. TCDM and cluster peripheral location are derived from
+    /// it. This signal is pseudo-static.
+    input  logic [AxiAddrWidth-1:0] cluster_base_addr_i,
+    /// Per-cluster probe on the cluster status. Can be written by the cores to indicate
+    /// to the overall system that the cluster is executing something.
+    output logic cluster_probe_o,
+    /// AXI Core cluster in-port.
+    input  axi_in_req_t axi_in_req_i,
+    output axi_in_resp_t axi_in_resp_o,
+    /// AXI Core cluster out-port to core.
+    output axi_out_req_t axi_out_req_o,
+    input  axi_out_resp_t axi_out_resp_i,
+    /// AXI Core cluster out-port to L2 Mem.
+    output axi_out_req_t axi_out_l2_req_o,
+    input  axi_out_resp_t axi_out_l2_resp_i,
+    /// SRAM Configuration: L1D Data + L1D Tag + L1D FIFO + L1I Data + L1I Tag
+    input  impl_in_t [NrSramCfg-1:0] impl_i,
+    /// Indicates a program execution error.
+    output logic error_o
+  );
+  // ---------
+  // Imports
+  // ---------
+  import snitch_pkg::*;
+  import snitch_icache_pkg::icache_events_t;
+
+  // ---------
+  // Constants
+  // ---------
+  /// Minimum width to hold the core number.
+  localparam int unsigned CoreIDWidth = cf_math_pkg::idx_width(NrCores);
+  localparam int unsigned TCDMMemAddrWidth = $clog2(TCDMDepth);
+  localparam int unsigned TCDMSize = NrBanks * TCDMDepth * BeWidth;
+  // The short address for SPM
+  localparam int unsigned SPMAddrWidth = $clog2(TCDMSize);
+  // Enlarge the address width for Spatz due to cache
+  localparam int unsigned TCDMAddrWidth = 32;
+  localparam int unsigned BanksPerSuperBank = AxiDataWidth / DataWidth;
+  localparam int unsigned NrSuperBanks = NrBanks / BanksPerSuperBank;
+
+  function automatic int unsigned get_tcdm_ports(int unsigned core);
+    return spatz_pkg::N_FU + 1;
+  endfunction
+
+  function automatic int unsigned get_tcdm_port_offs(int unsigned core_idx);
+    automatic int n = 0;
+    for (int i = 0; i < core_idx; i++) n += get_tcdm_ports(i);
+    return n;
+  endfunction
+
+  localparam int unsigned NrTCDMPortsPerCore = get_tcdm_ports(0);
+  localparam int unsigned NrTCDMPortsCores = get_tcdm_port_offs(NrCores);
+  localparam int unsigned NumTCDMIn = NrTCDMPortsCores + 1;
+  localparam logic [AxiAddrWidth-1:0] TCDMMask = ~(TCDMSize-1);
+
+  // Core Request, SoC Request
+  localparam int unsigned NrNarrowMasters = 2;
+
+  // Narrow AXI network parameters
+  localparam int unsigned NarrowIdWidthIn = AxiIdWidthIn;
+  localparam int unsigned NarrowIdWidthOut = NarrowIdWidthIn + $clog2(NrNarrowMasters);
+  localparam int unsigned NarrowDataWidth = ELEN;
+  localparam int unsigned NarrowUserWidth = AxiUserWidth;
+
+  // TCDM, Peripherals, SoC Request
+  localparam int unsigned NrNarrowSlaves = 3;
+  localparam int unsigned NrNarrowRules = NrNarrowSlaves - 1;
+
+  // Core Request, DMA, Instruction cache
+  /// Additional one for L1 DCache
+  localparam int unsigned NrWideMasters = 3 + 1;
+  localparam int unsigned WideIdWidthOut = AxiIdWidthOut;
+  localparam int unsigned WideIdWidthIn = WideIdWidthOut - $clog2(NrWideMasters);
+  // DMA X-BAR configuration
+  localparam int unsigned NrWideSlaves = 3 + 1; // one port for L2, one for L3/LLC (virtual)
+
+  // AXI Configuration
+  localparam axi_pkg::xbar_cfg_t ClusterXbarCfg = '{
+    NoSlvPorts        : NrNarrowMasters,
+    NoMstPorts        : NrNarrowSlaves,
+    MaxMstTrans       : MaxMstTrans,
+    MaxSlvTrans       : MaxSlvTrans,
+    FallThrough       : 1'b0,
+    LatencyMode       : XbarLatency,
+    AxiIdWidthSlvPorts: NarrowIdWidthIn,
+    AxiIdUsedSlvPorts : NarrowIdWidthIn,
+    UniqueIds         : 1'b0,
+    AxiAddrWidth      : AxiAddrWidth,
+    AxiDataWidth      : NarrowDataWidth,
+    NoAddrRules       : NrNarrowRules,
+    default           : '0
+  };
+
+  // DMA configuration struct
+  localparam axi_pkg::xbar_cfg_t DmaXbarCfg = '{
+    NoSlvPorts        : NrWideMasters,
+    NoMstPorts        : NrWideSlaves,
+    MaxMstTrans       : MaxMstTrans,
+    MaxSlvTrans       : MaxSlvTrans,
+    FallThrough       : 1'b0,
+    LatencyMode       : XbarLatency,
+    AxiIdWidthSlvPorts: WideIdWidthIn,
+    AxiIdUsedSlvPorts : WideIdWidthIn,
+    UniqueIds         : 1'b0,
+    AxiAddrWidth      : AxiAddrWidth,
+    AxiDataWidth      : AxiDataWidth,
+    NoAddrRules       : NrWideSlaves - 1,
+    default           : '0
+  };
+
+  // --------
+  // Typedefs
+  // --------
+  typedef logic [AxiAddrWidth-1:0] addr_t;
+  typedef logic [NarrowDataWidth-1:0] data_t;
+  typedef logic [63:0] tag_data_t;
+  typedef logic [NarrowDataWidth/8-1:0] strb_t;
+  typedef logic [AxiDataWidth-1:0] data_dma_t;
+  typedef logic [AxiDataWidth/8-1:0] strb_dma_t;
+  typedef logic [NarrowIdWidthIn-1:0] id_mst_t;
+  typedef logic [NarrowIdWidthOut-1:0] id_slv_t;
+  typedef logic [WideIdWidthIn-1:0] id_dma_mst_t;
+  typedef logic [WideIdWidthOut-1:0] id_dma_slv_t;
+  typedef logic [WideIdWidthIn-$clog2(NumL1CacheCtrl)-1:0] id_dcache_mst_t;
+  typedef logic
[NarrowUserWidth-1:0] user_t; + typedef logic [AxiUserWidth-1:0] user_dma_t; + + typedef logic [TCDMMemAddrWidth-1:0] tcdm_mem_addr_t; + typedef logic [TCDMAddrWidth-1:0] tcdm_addr_t; + typedef logic [SPMAddrWidth-1:0] spm_addr_t; + + typedef logic [$clog2(NumSpatzOutstandingLoads[0])-1:0] reqid_t; + + typedef logic [$clog2(L1NumSet)-1:0] tcdm_bank_addr_t; + + typedef struct packed { + logic [CoreIDWidth-1:0] core_id; + logic is_core; + logic is_amo; + reqid_t req_id; + } tcdm_user_t; + + // The metadata type used to restore the information from req to rsp + typedef struct packed { + tcdm_user_t user; + logic write; + } tcdm_meta_t; + + + // Regbus peripherals. + `AXI_TYPEDEF_ALL(axi_mst, addr_t, id_mst_t, data_t, strb_t, user_t) + `AXI_TYPEDEF_ALL(axi_slv, addr_t, id_slv_t, data_t, strb_t, user_t) + `AXI_TYPEDEF_ALL(axi_mst_dma, addr_t, id_dma_mst_t, data_dma_t, strb_dma_t, user_dma_t) + `AXI_TYPEDEF_ALL(axi_slv_dma, addr_t, id_dma_slv_t, data_dma_t, strb_dma_t, user_dma_t) + `AXI_TYPEDEF_ALL(axi_dcache, addr_t, id_dcache_mst_t, data_dma_t, strb_dma_t, user_dma_t) + + `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t) + + `MEM_TYPEDEF_ALL(mem, tcdm_mem_addr_t, data_t, strb_t, tcdm_user_t) + `MEM_TYPEDEF_ALL(mem_dma, tcdm_mem_addr_t, data_dma_t, strb_dma_t, logic) + + `TCDM_TYPEDEF_ALL(tcdm, tcdm_addr_t, data_t, strb_t, tcdm_user_t) + `TCDM_TYPEDEF_ALL(tcdm_dma, tcdm_addr_t, data_dma_t, strb_dma_t, logic) + `TCDM_TYPEDEF_ALL(spm, spm_addr_t, data_t, strb_t, tcdm_user_t) + + `REG_BUS_TYPEDEF_ALL(reg, addr_t, data_t, strb_t) + `REG_BUS_TYPEDEF_ALL(reg_dma, addr_t, data_dma_t, strb_dma_t) + + // Event counter increments for the TCDM. + typedef struct packed { + /// Number requests going in + logic [$clog2(NrTCDMPortsCores):0] inc_accessed; + /// Number of requests stalled due to congestion + logic [$clog2(NrTCDMPortsCores):0] inc_congested; + } tcdm_events_t; + + // Event counter increments for DMA. + typedef struct packed { + logic aw_stall, ar_stall, r_stall, w_stall, + buf_w_stall, buf_r_stall; + logic aw_valid, aw_ready, aw_done, aw_bw; + logic ar_valid, ar_ready, ar_done, ar_bw; + logic r_valid, r_ready, r_done, r_bw; + logic w_valid, w_ready, w_done, w_bw; + logic b_valid, b_ready, b_done; + logic dma_busy; + axi_pkg::len_t aw_len, ar_len; + axi_pkg::size_t aw_size, ar_size; + logic [$clog2(AxiDataWidth/8):0] num_bytes_written; + } dma_events_t; + + typedef struct packed { + int unsigned idx; + addr_t start_addr; + addr_t end_addr; + } xbar_rule_t; + + typedef struct packed { + acc_addr_e addr; + logic [5:0] id; + logic [31:0] data_op; + data_t data_arga; + data_t data_argb; + addr_t data_argc; + } acc_issue_req_t; + + typedef struct packed { + logic accept; + logic writeback; + logic loadstore; + logic exception; + logic isfloat; + } acc_issue_rsp_t; + + typedef struct packed { + logic [5:0] id; + logic error; + data_t data; + } acc_rsp_t; + + `SNITCH_VM_TYPEDEF(AxiAddrWidth) + + typedef struct packed { + // Slow domain. + logic flush_i_valid; + addr_t inst_addr; + logic inst_cacheable; + logic inst_valid; + // Fast domain. + acc_issue_req_t acc_req; + logic acc_qvalid; + logic acc_pready; + // Slow domain. + logic [1:0] ptw_valid; + va_t [1:0] ptw_va; + pa_t [1:0] ptw_ppn; + } hive_req_t; + + typedef struct packed { + // Slow domain. + logic flush_i_ready; + logic [31:0] inst_data; + logic inst_ready; + logic inst_error; + // Fast domain. + logic acc_qready; + acc_rsp_t acc_resp; + logic acc_pvalid; + // Slow domain. 
+ logic [1:0] ptw_ready; + l0_pte_t [1:0] ptw_pte; + logic [1:0] ptw_is_4mega; + } hive_rsp_t; + + // ----------- + // Assignments + // ----------- + // Calculate start and end address of TCDM based on the `cluster_base_addr_i`. + addr_t tcdm_start_address, tcdm_end_address; + assign tcdm_start_address = (cluster_base_addr_i & TCDMMask); + assign tcdm_end_address = (tcdm_start_address + TCDMSize) & TCDMMask; + + addr_t cluster_periph_start_address, cluster_periph_end_address; + assign cluster_periph_start_address = tcdm_end_address; + assign cluster_periph_end_address = tcdm_end_address + ClusterPeriphSize * 1024; + + localparam int unsigned ClusterReserve = 4096; // 4 MiB + localparam int unsigned ClusterL2Size = 8192; // 8 MiB + addr_t cluster_l2_start_address, cluster_l2_end_address; + assign cluster_l2_start_address = L2Addr; + assign cluster_l2_end_address = L2Addr + L2Size; + + // ---------------- + // Wire Definitions + // ---------------- + // 1. AXI + axi_slv_req_t [NrNarrowSlaves-1:0] narrow_axi_slv_req; + axi_slv_resp_t [NrNarrowSlaves-1:0] narrow_axi_slv_rsp; + axi_mst_req_t [NrNarrowMasters-1:0] narrow_axi_mst_req; + axi_mst_resp_t [NrNarrowMasters-1:0] narrow_axi_mst_rsp; + + // DMA AXI buses + axi_mst_dma_req_t [NrWideMasters-1:0] wide_axi_mst_req; + axi_mst_dma_resp_t [NrWideMasters-1:0] wide_axi_mst_rsp; + axi_slv_dma_req_t [NrWideSlaves-1 :0] wide_axi_slv_req; + axi_slv_dma_resp_t [NrWideSlaves-1 :0] wide_axi_slv_rsp; + + // AXI req/rsp from/to cache controllers + axi_dcache_req_t [NumL1CacheCtrl-1:0] dcache_axi_req; + axi_dcache_resp_t [NumL1CacheCtrl-1:0] dcache_axi_rsp; + + // 2. Memory Subsystem (Banks) + mem_req_t [NrSuperBanks-1:0][BanksPerSuperBank-1:0] ic_req; + mem_rsp_t [NrSuperBanks-1:0][BanksPerSuperBank-1:0] ic_rsp; + + mem_dma_req_t [NrSuperBanks-1:0] sb_dma_req; + mem_dma_rsp_t [NrSuperBanks-1:0] sb_dma_rsp; + + // 3. Memory Subsystem (Interconnect) + tcdm_dma_req_t ext_dma_req; + tcdm_dma_rsp_t ext_dma_rsp; + + // AXI Ports into TCDM (from SoC). + spm_req_t axi_soc_req; + spm_rsp_t axi_soc_rsp; + + tcdm_req_t [NrTCDMPortsCores-1:0] tcdm_req; + tcdm_rsp_t [NrTCDMPortsCores-1:0] tcdm_rsp; + + core_events_t [NrCores-1:0] core_events; + tcdm_events_t tcdm_events; + dma_events_t dma_events; + snitch_icache_pkg::icache_events_t [NrCores-1:0] icache_events; + + // 4. Memory Subsystem (Core side). + reqrsp_req_t [NrCores-1:0] core_req, filtered_core_req; + reqrsp_rsp_t [NrCores-1:0] core_rsp, filtered_core_rsp; + + // 5. Peripheral Subsystem + reg_req_t reg_req; + reg_rsp_t reg_rsp; + + // 6. BootROM + reg_dma_req_t bootrom_reg_req; + reg_dma_rsp_t bootrom_reg_rsp; + + // 7. Misc. Wires. + logic icache_prefetch_enable; + logic [NrCores-1:0] cl_interrupt; + + // 8. 
L1 D$ + spm_req_t [NrTCDMPortsCores-1:0] spm_req; + spm_rsp_t [NrTCDMPortsCores-1:0] spm_rsp; + + tcdm_req_t [NrTCDMPortsCores-1:0] unmerge_req; + tcdm_rsp_t [NrTCDMPortsCores-1:0] unmerge_rsp; + + tcdm_req_t [NrTCDMPortsPerCore-1:0][NumL1CacheCtrl-1:0] cache_req, cache_xbar_req, cache_amo_req; + tcdm_rsp_t [NrTCDMPortsPerCore-1:0][NumL1CacheCtrl-1:0] cache_rsp, cache_xbar_rsp, cache_amo_rsp; + + logic [NumL1CacheCtrl-1:0][NrTCDMPortsPerCore-1:0] cache_req_valid; + logic [NumL1CacheCtrl-1:0][NrTCDMPortsPerCore-1:0] cache_req_ready; + tcdm_addr_t [NumL1CacheCtrl-1:0][NrTCDMPortsPerCore-1:0] cache_req_addr; + tcdm_user_t [NumL1CacheCtrl-1:0][NrTCDMPortsPerCore-1:0] cache_req_meta; + logic [NumL1CacheCtrl-1:0][NrTCDMPortsPerCore-1:0] cache_req_write; + data_t [NumL1CacheCtrl-1:0][NrTCDMPortsPerCore-1:0] cache_req_data; + + logic [NumL1CacheCtrl-1:0][NrTCDMPortsPerCore-1:0] cache_rsp_valid; + logic [NumL1CacheCtrl-1:0][NrTCDMPortsPerCore-1:0] cache_rsp_ready; + logic [NumL1CacheCtrl-1:0][NrTCDMPortsPerCore-1:0] cache_rsp_write; + data_t [NumL1CacheCtrl-1:0][NrTCDMPortsPerCore-1:0] cache_rsp_data; + tcdm_user_t [NumL1CacheCtrl-1:0][NrTCDMPortsPerCore-1:0] cache_rsp_meta; + + logic [NumL1CacheCtrl-1:0][NumTagBankPerCtrl-1:0] l1_tag_bank_req; + logic [NumL1CacheCtrl-1:0][NumTagBankPerCtrl-1:0] l1_tag_bank_we; + tcdm_bank_addr_t [NumL1CacheCtrl-1:0][NumTagBankPerCtrl-1:0] l1_tag_bank_addr; + tag_data_t [NumL1CacheCtrl-1:0][NumTagBankPerCtrl-1:0] l1_tag_bank_wdata; + logic [NumL1CacheCtrl-1:0][NumTagBankPerCtrl-1:0] l1_tag_bank_be; + tag_data_t [NumL1CacheCtrl-1:0][NumTagBankPerCtrl-1:0] l1_tag_bank_rdata; + + logic [NumL1CacheCtrl-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_req; + logic [NumL1CacheCtrl-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_we; + tcdm_bank_addr_t [NumL1CacheCtrl-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_addr; + data_t [NumL1CacheCtrl-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_wdata; + logic [NumL1CacheCtrl-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_be; + data_t [NumL1CacheCtrl-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_rdata; + logic [NumL1CacheCtrl-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_gnt; + + logic l1d_insn_valid; + logic [NumL1CacheCtrl-1:0] l1d_insn_ready; + logic [1:0] l1d_insn; + tcdm_bank_addr_t cfg_spm_size; + logic l1d_busy; + + // High if a port access an illegal SPM region (mapped to cache) + logic [NrTCDMPortsCores-1:0] spm_error; + + + // 9. SRAM Configuration + // impl_in_t [L1NumWrapper-1:0][L1BankPerWP-1:0] impl_l1d_data; + // impl_in_t [L1NumTagBank-1:0] impl_l1d_tag; + // impl_in_t [1:0] impl_l1d_fifo; + + // impl_in_t [ICacheSets-1:0] impl_l1i_data; + // impl_in_t [ICacheSets-1:0] impl_l1i_tag; + + // assign {impl_l1d_data, impl_l1d_tag, impl_l1d_fifo, impl_l1i_data, impl_l1i_tag} = impl_i; + assign error_o = |spm_error; + + + // ------------- + // DMA Subsystem + // ------------- + // Optionally decouple the external wide AXI master port. 
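+  // Note: `axi_cut` with Bypass = !RegisterExt is a pure feed-through when
+  // RegisterExt is 0; with RegisterExt = 1 it inserts a register slice on all
+  // five AXI channels, cutting the timing paths at the cluster boundary.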
+ axi_cut #( + .Bypass (!RegisterExt ), + .aw_chan_t (axi_slv_dma_aw_chan_t), + .w_chan_t (axi_slv_dma_w_chan_t ), + .b_chan_t (axi_slv_dma_b_chan_t ), + .ar_chan_t (axi_slv_dma_ar_chan_t), + .r_chan_t (axi_slv_dma_r_chan_t ), + .axi_req_t (axi_slv_dma_req_t ), + .axi_resp_t (axi_slv_dma_resp_t ) + ) i_cut_ext_wide_out ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .slv_req_i (wide_axi_slv_req[SoCDMAOut]), + .slv_resp_o (wide_axi_slv_rsp[SoCDMAOut]), + .mst_req_o (axi_out_req_o ), + .mst_resp_i (axi_out_resp_i ) + ); + + axi_cut #( + .Bypass (!RegisterExt ), + .aw_chan_t (axi_slv_dma_aw_chan_t), + .w_chan_t (axi_slv_dma_w_chan_t ), + .b_chan_t (axi_slv_dma_b_chan_t ), + .ar_chan_t (axi_slv_dma_ar_chan_t), + .r_chan_t (axi_slv_dma_r_chan_t ), + .axi_req_t (axi_slv_dma_req_t ), + .axi_resp_t (axi_slv_dma_resp_t ) + ) i_cut_ext_l2_wide_out ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .slv_req_i (wide_axi_slv_req[L2Mem]), + .slv_resp_o (wide_axi_slv_rsp[L2Mem]), + .mst_req_o (axi_out_l2_req_o ), + .mst_resp_i (axi_out_l2_resp_i ) + ); + + axi_cut #( + .Bypass (!RegisterExt ), + .aw_chan_t (axi_mst_aw_chan_t), + .w_chan_t (axi_mst_w_chan_t ), + .b_chan_t (axi_mst_b_chan_t ), + .ar_chan_t (axi_mst_ar_chan_t), + .r_chan_t (axi_mst_r_chan_t ), + .axi_req_t (axi_mst_req_t ), + .axi_resp_t (axi_mst_resp_t ) + ) i_cut_ext_narrow_in ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .slv_req_i (axi_in_req_i ), + .slv_resp_o (axi_in_resp_o ), + .mst_req_o (narrow_axi_mst_req[SoCDMAIn]), + .mst_resp_i (narrow_axi_mst_rsp[SoCDMAIn]) + ); + + logic [DmaXbarCfg.NoSlvPorts-1:0][$clog2(DmaXbarCfg.NoMstPorts)-1:0] dma_xbar_default_port; + xbar_rule_t [DmaXbarCfg.NoAddrRules-1:0] dma_xbar_rule; + + assign dma_xbar_default_port = '{default: SoCDMAOut}; + assign dma_xbar_rule = '{ + '{ + idx : TCDMDMA, + start_addr: tcdm_start_address, + end_addr : tcdm_end_address + }, + '{ + idx : BootROM, + start_addr: BootAddr, + end_addr : BootAddr + 'h1000 + }, + '{ + idx : L2Mem, + start_addr: cluster_l2_start_address, + end_addr : cluster_l2_end_address + } + }; + + localparam bit [DmaXbarCfg.NoSlvPorts-1:0] DMAEnableDefaultMstPort = '1; + axi_xbar #( + .Cfg (DmaXbarCfg ), + .ATOPs (0 ), + .slv_aw_chan_t (axi_mst_dma_aw_chan_t), + .mst_aw_chan_t (axi_slv_dma_aw_chan_t), + .w_chan_t (axi_mst_dma_w_chan_t ), + .slv_b_chan_t (axi_mst_dma_b_chan_t ), + .mst_b_chan_t (axi_slv_dma_b_chan_t ), + .slv_ar_chan_t (axi_mst_dma_ar_chan_t), + .mst_ar_chan_t (axi_slv_dma_ar_chan_t), + .slv_r_chan_t (axi_mst_dma_r_chan_t ), + .mst_r_chan_t (axi_slv_dma_r_chan_t ), + .slv_req_t (axi_mst_dma_req_t ), + .slv_resp_t (axi_mst_dma_resp_t ), + .mst_req_t (axi_slv_dma_req_t ), + .mst_resp_t (axi_slv_dma_resp_t ), + .rule_t (xbar_rule_t ) + ) i_axi_dma_xbar ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .test_i (1'b0 ), + .slv_ports_req_i (wide_axi_mst_req ), + .slv_ports_resp_o (wide_axi_mst_rsp ), + .mst_ports_req_o (wide_axi_slv_req ), + .mst_ports_resp_i (wide_axi_slv_rsp ), + .addr_map_i (dma_xbar_rule ), + .en_default_mst_port_i (DMAEnableDefaultMstPort), + .default_mst_port_i (dma_xbar_default_port ) + ); + + addr_t ext_dma_req_q_addr_nontrunc; + + axi_to_mem_interleaved #( + .axi_req_t (axi_slv_dma_req_t ), + .axi_resp_t (axi_slv_dma_resp_t ), + .AddrWidth (AxiAddrWidth ), + .DataWidth (AxiDataWidth ), + .IdWidth (WideIdWidthOut ), + .NumBanks (1 ), + .BufDepth (MemoryMacroLatency + 1) + ) i_axi_to_mem_dma ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .busy_o (/* Unused */ ), + .test_i (1'b0 ), + .axi_req_i (wide_axi_slv_req[TCDMDMA] ), + .axi_resp_o 
(wide_axi_slv_rsp[TCDMDMA]             ),
+    .mem_req_o    (ext_dma_req.q_valid                   ),
+    .mem_gnt_i    (ext_dma_rsp.q_ready                   ),
+    .mem_addr_o   (ext_dma_req_q_addr_nontrunc           ),
+    .mem_wdata_o  (ext_dma_req.q.data                    ),
+    .mem_strb_o   (ext_dma_req.q.strb                    ),
+    .mem_atop_o   (/* The DMA does not support atomics */),
+    .mem_we_o     (ext_dma_req.q.write                   ),
+    .mem_rvalid_i (ext_dma_rsp.p_valid                   ),
+    .mem_rdata_i  (ext_dma_rsp.p.data                    )
+  );
+
+  assign ext_dma_req.q.addr = tcdm_addr_t'(ext_dma_req_q_addr_nontrunc);
+  assign ext_dma_req.q.amo  = reqrsp_pkg::AMONone;
+  assign ext_dma_req.q.user = '0;
+
+  spatz_tcdm_interconnect #(
+    .NumInp                (1                 ),
+    .NumOut                (NrSuperBanks      ),
+    .tcdm_req_t            (tcdm_dma_req_t    ),
+    .tcdm_rsp_t            (tcdm_dma_rsp_t    ),
+    .mem_req_t             (mem_dma_req_t     ),
+    .mem_rsp_t             (mem_dma_rsp_t     ),
+    .user_t                (logic             ),
+    .MemAddrWidth          (TCDMMemAddrWidth  ),
+    .DataWidth             (AxiDataWidth      ),
+    .MemoryResponseLatency (MemoryMacroLatency)
+  ) i_dma_interconnect (
+    .clk_i     (clk_i      ),
+    .rst_ni    (rst_ni     ),
+    .req_i     (ext_dma_req),
+    .rsp_o     (ext_dma_rsp),
+    .mem_req_o (sb_dma_req ),
+    .mem_rsp_i (sb_dma_rsp )
+  );
+
+  // ----------------
+  // Memory Subsystem
+  // ----------------
+  for (genvar i = 0; i < NrSuperBanks; i++) begin : gen_tcdm_super_bank
+
+    mem_req_t [BanksPerSuperBank-1:0] amo_req;
+    mem_rsp_t [BanksPerSuperBank-1:0] amo_rsp;
+
+    logic           [BanksPerSuperBank-1:0] mem_cs, mem_wen;
+    tcdm_mem_addr_t [BanksPerSuperBank-1:0] mem_add;
+    tcdm_mem_addr_t [BanksPerSuperBank-1:0] mem_add_max;
+    strb_t          [BanksPerSuperBank-1:0] mem_be;
+    data_t          [BanksPerSuperBank-1:0] mem_rdata, mem_wdata;
+    tcdm_meta_t     [BanksPerSuperBank-1:0] bank_req_meta, mem_req_meta, bank_rsp_meta;
+
+    mem_wide_narrow_mux #(
+      .NarrowDataWidth  (NarrowDataWidth),
+      .WideDataWidth    (AxiDataWidth   ),
+      .mem_narrow_req_t (mem_req_t      ),
+      .mem_narrow_rsp_t (mem_rsp_t      ),
+      .mem_wide_req_t   (mem_dma_req_t  ),
+      .mem_wide_rsp_t   (mem_dma_rsp_t  )
+    ) i_tcdm_mux (
+      .clk_i           (clk_i                ),
+      .rst_ni          (rst_ni               ),
+      .in_narrow_req_i (ic_req [i]           ),
+      .in_narrow_rsp_o (ic_rsp [i]           ),
+      .in_wide_req_i   (sb_dma_req [i]       ),
+      .in_wide_rsp_o   (sb_dma_rsp [i]       ),
+      .out_req_o       (amo_req              ),
+      .out_rsp_i       (amo_rsp              ),
+      .sel_wide_i      (sb_dma_req[i].q_valid)
+    );
+
+    // Generate the banks of the superbank
+    for (genvar j = 0; j < BanksPerSuperBank; j++) begin : gen_tcdm_bank
+      tc_sram_impl #(
+        .NumWords  (TCDMDepth),
+        .DataWidth (DataWidth),
+        .ByteWidth (8        ),
+        .NumPorts  (1        ),
+        .Latency   (1        )
+      ) i_spm_mem (
+        .clk_i   (clk_i       ),
+        .rst_ni  (rst_ni      ),
+        .impl_i  ('0          ),
+        .impl_o  (/* Unused */),
+        .req_i   (mem_cs[j]   ),
+        .we_i    (mem_wen[j]  ),
+        .addr_i  (mem_add[j]  ),
+        .wdata_i (mem_wdata[j]),
+        .be_i    (mem_be[j]   ),
+        .rdata_o (mem_rdata[j])
+      );
+
+      data_t amo_rdata_local;
+
+      // TODO(zarubaf): Share atomic units between multiple cuts
+      snitch_amo_shim #(
+        .AddrMemWidth ( TCDMMemAddrWidth ),
+        .DataWidth    ( DataWidth        ),
+        .CoreIDWidth  ( CoreIDWidth      )
+      ) i_amo_shim (
+        .clk_i          (clk_i                     ),
+        .rst_ni         (rst_ni                    ),
+        .valid_i        (amo_req[j].q_valid        ),
+        .ready_o        (amo_rsp[j].q_ready        ),
+        .addr_i         (amo_req[j].q.addr         ),
+        .write_i        (amo_req[j].q.write        ),
+        .wdata_i        (amo_req[j].q.data         ),
+        .wstrb_i        (amo_req[j].q.strb         ),
+        .core_id_i      (amo_req[j].q.user.core_id ),
+        .is_core_i      (amo_req[j].q.user.is_core ),
+        .rdata_o        (amo_rdata_local           ),
+        .amo_i          (amo_req[j].q.amo          ),
+        .mem_req_o      (mem_cs[j]                 ),
+        .mem_add_o      (mem_add[j]                ),
+        .mem_wen_o      (mem_wen[j]                ),
+        .mem_wdata_o    (mem_wdata[j]              ),
+        .mem_be_o       (mem_be[j]                 ),
+        .mem_rdata_i    (mem_rdata[j]              ),
+        .dma_access_i   (sb_dma_req[i].q_valid     ),
+        // TODO(zarubaf): Signal AMO conflict somewhere. Socregs?
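+        // `amo_conflict_o` presumably flags a write colliding with an
+        // in-flight atomic on the same address; it is left unconnected here.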
+ .amo_conflict_o (/* Unused */ ) + ); + + // Insert a pipeline register at the output of each SRAM. + shift_reg #( + .dtype(data_t ), + .Depth(int'(RegisterTCDMCuts)) + ) i_sram_pipe ( + .clk_i (clk_i ), + .rst_ni(rst_ni ), + .d_i (amo_rdata_local ), + .d_o (amo_rsp[j].p.data) + ); + + // the meta data information + assign bank_req_meta[j] = '{ + user: amo_req[j].q.user, + write: amo_req[j].q.write, + default: '0 + }; + assign amo_rsp[j].p.user = bank_rsp_meta[j].user; + assign amo_rsp[j].p.write = bank_rsp_meta[j].write; + + shift_reg #( + .dtype(tcdm_meta_t ), + .Depth(int'(RegisterTCDMCuts)) + ) i_req_meta_pipe ( + .clk_i (clk_i ), + .rst_ni(rst_ni ), + .d_i (bank_req_meta[j] ), + .d_o (mem_req_meta[j] ) + ); + shift_reg #( + .dtype(tcdm_meta_t ), + .Depth(int'(RegisterTCDMCuts)) + ) i_rsp_meta_pipe ( + .clk_i (clk_i ), + .rst_ni(rst_ni ), + .d_i (mem_req_meta[j] ), + .d_o (bank_rsp_meta[j] ) + ); + end + end + + logic [NrTCDMPortsCores-1:0] unmerge_pready; + logic [NrTCDMPortsPerCore-1:0][NumL1CacheCtrl-1:0] cache_pready, cache_xbar_pready, cache_amo_pready; + + // split the requests for spm or cache from core side + spatz_addr_mapper #( + .NumIO (NrTCDMPortsCores ), + .AddrWidth (L1AddrWidth ), + .SPMAddrWidth (SPMAddrWidth ), + .DataWidth (DataWidth ), + .mem_req_t (tcdm_req_t ), + .mem_rsp_t (tcdm_rsp_t ), + .mem_rsp_chan_t (tcdm_rsp_chan_t ), + .spm_req_t (spm_req_t ), + .spm_rsp_t (spm_rsp_t ) + ) i_tcdm_mapper ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + // Input + .mem_req_i (tcdm_req ), + .mem_rsp_o (tcdm_rsp ), + .error_o (spm_error ), + // Address + .tcdm_start_address_i (tcdm_start_address[L1AddrWidth-1:0] ), + .tcdm_end_address_i (tcdm_end_address[L1AddrWidth-1:0] ), + .spm_size_i (tcdm_end_address[L1AddrWidth-1:0] - tcdm_start_address[L1AddrWidth-1:0]), + .flush_i (l1d_busy ), + // Output + .spm_req_o (spm_req ), + .spm_rsp_i (spm_rsp ), + .cache_req_o (unmerge_req ), + .cache_pready_o (unmerge_pready ), + .cache_rsp_i (unmerge_rsp ) + ); + + for (genvar j = 0; j < NrTCDMPortsPerCore; j++) begin + for (genvar cb = 0; cb < NumL1CacheCtrl; cb++) begin + assign cache_req [j][cb] = unmerge_req [cb*NrTCDMPortsPerCore+j]; + assign cache_pready[j][cb] = unmerge_pready[cb*NrTCDMPortsPerCore+j]; + assign unmerge_rsp [cb*NrTCDMPortsPerCore+j] = cache_rsp [j][cb]; + end + end + + // Used to determine the mapping policy between different cache banks. 
+ // Set through CSR + logic [$clog2(32)-1:0] dynamic_offset; + + /// Wire requests after strb handling to the cache controller + for (genvar j = 0; j < NrTCDMPortsPerCore; j++) begin : gen_cache_xbar + tcdm_cache_interco #( + .NumCore (NrCores ), + .NumCache (NumL1CacheCtrl ), + .AddrWidth (32'd32 ), + .tcdm_req_t (tcdm_req_t ), + .tcdm_rsp_t (tcdm_rsp_t ), + .tcdm_req_chan_t (tcdm_req_chan_t ), + .tcdm_rsp_chan_t (tcdm_rsp_chan_t ) + ) i_cache_xbar ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .dynamic_offset_i (dynamic_offset ), + .core_req_i (cache_req [j] ), + .core_rsp_ready_i (cache_pready [j] ), + .core_rsp_o (cache_rsp [j] ), + .mem_req_o (cache_xbar_req [j] ), + .mem_rsp_ready_o (cache_xbar_pready[j] ), + .mem_rsp_i (cache_xbar_rsp [j] ) + ); + end + + for (genvar cb = 0; cb < NumL1CacheCtrl; cb++) begin : gen_cache_connect + for (genvar j = 0; j < NrTCDMPortsPerCore; j++) begin : gen_cache_amo + spatz_cache_amo #( + .DataWidth ( DataWidth ), + .CoreIDWidth ( CoreIDWidth ), + .tcdm_req_t ( tcdm_req_t ), + .tcdm_rsp_t ( tcdm_rsp_t ), + .tcdm_req_chan_t ( tcdm_req_chan_t ), + .tcdm_rsp_chan_t ( tcdm_rsp_chan_t ), + .tcdm_user_t ( tcdm_user_t ) + ) i_cache_amo ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .core_req_i (cache_xbar_req [j][cb] ), + .core_rsp_ready_i (cache_xbar_pready[j][cb] ), + .core_rsp_o (cache_xbar_rsp [j][cb] ), + .mem_req_o (cache_amo_req [j][cb] ), + .mem_rsp_ready_o (cache_amo_pready [j][cb] ), + .mem_rsp_i (cache_amo_rsp [j][cb] ) + ); + assign cache_req_valid[cb][j] = cache_amo_req[j][cb].q_valid; + assign cache_req_addr [cb][j] = cache_amo_req[j][cb].q.addr; + assign cache_req_meta [cb][j] = cache_amo_req[j][cb].q.user; + assign cache_req_write[cb][j] = cache_amo_req[j][cb].q.write; + assign cache_req_data [cb][j] = cache_amo_req[j][cb].q.data; + + // assign cache_rsp_ready[cb][j] = 1'b1; + assign cache_rsp_ready[cb][j] = cache_amo_pready[j][cb]; + + assign cache_amo_rsp[j][cb].p_valid = cache_rsp_valid[cb][j]; + assign cache_amo_rsp[j][cb].q_ready = cache_req_ready[cb][j]; + assign cache_amo_rsp[j][cb].p.data = cache_rsp_data [cb][j]; + assign cache_amo_rsp[j][cb].p.user = cache_rsp_meta [cb][j]; + + assign cache_amo_rsp[j][cb].p.write = cache_rsp_write[cb][j]; + end + end + + // TODO: remove + tcdm_bank_addr_t num_spm_lines; + assign num_spm_lines = cfg_spm_size * (DepthPerBank / L1Size); + + for (genvar cb = 0; cb < NumL1CacheCtrl; cb++) begin: gen_l1_cache_ctrl + flamingo_spatz_cache_ctrl #( + // Core + .NumPorts (NrTCDMPortsPerCore ), + .CoalExtFactor (L1CoalFactor ), + .AddrWidth (L1AddrWidth ), + .WordWidth (DataWidth ), + // Cache + .NumCacheEntry (L1NumEntryPerCtrl ), + .CacheLineWidth (L1LineWidth ), + .SetAssociativity (L1AssoPerCtrl ), + .BankFactor (L1BankFactor ), + // Type + .core_meta_t (tcdm_user_t ), + .impl_in_t (impl_in_t ), + .axi_req_t (axi_dcache_req_t ), + .axi_resp_t (axi_dcache_resp_t ) + ) i_l1_controller ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .impl_i ('0 ), + // Sync Control + .cache_sync_valid_i (l1d_insn_valid ), + .cache_sync_ready_o (l1d_insn_ready[cb] ), + .cache_sync_insn_i (l1d_insn ), + // SPM Size + // The calculation of spm region in cache is different + // than other modules (needs to times 2) + // Currently assume full cache + .bank_depth_for_SPM_i ('0 ), + // Request + .core_req_valid_i (cache_req_valid[cb] ), + .core_req_ready_o (cache_req_ready[cb] ), + .core_req_addr_i (cache_req_addr[cb] ), + .core_req_meta_i (cache_req_meta[cb] ), + .core_req_write_i (cache_req_write[cb] ), + .core_req_wdata_i 
(cache_req_data[cb]      ),
+      // Response
+      .core_resp_valid_o     (cache_rsp_valid[cb]    ),
+      .core_resp_ready_i     (cache_rsp_ready[cb]    ),
+      .core_resp_write_o     (cache_rsp_write[cb]    ),
+      .core_resp_data_o      (cache_rsp_data[cb]     ),
+      .core_resp_meta_o      (cache_rsp_meta[cb]     ),
+      // AXI refill
+      .axi_req_o             (dcache_axi_req[cb]     ),
+      .axi_resp_i            (dcache_axi_rsp[cb]     ),
+      // Tag Banks
+      .tcdm_tag_bank_req_o   (l1_tag_bank_req[cb]    ),
+      .tcdm_tag_bank_we_o    (l1_tag_bank_we[cb]     ),
+      .tcdm_tag_bank_addr_o  (l1_tag_bank_addr[cb]   ),
+      .tcdm_tag_bank_wdata_o (l1_tag_bank_wdata[cb]  ),
+      .tcdm_tag_bank_be_o    (l1_tag_bank_be[cb]     ),
+      .tcdm_tag_bank_rdata_i (l1_tag_bank_rdata[cb]  ),
+      // Data Banks
+      .tcdm_data_bank_req_o  (l1_data_bank_req[cb]   ),
+      .tcdm_data_bank_we_o   (l1_data_bank_we[cb]    ),
+      .tcdm_data_bank_addr_o (l1_data_bank_addr[cb]  ),
+      .tcdm_data_bank_wdata_o(l1_data_bank_wdata[cb] ),
+      .tcdm_data_bank_be_o   (l1_data_bank_be[cb]    ),
+      .tcdm_data_bank_rdata_i(l1_data_bank_rdata[cb] ),
+      .tcdm_data_bank_gnt_i  (l1_data_bank_gnt[cb]   )
+    );
+
+    for (genvar j = 0; j < NumTagBankPerCtrl; j++) begin
+      tc_sram_impl #(
+        .NumWords  (L1CacheWayEntry/L1BankFactor),
+        .DataWidth ($bits(tag_data_t)           ),
+        .ByteWidth ($bits(tag_data_t)           ),
+        .NumPorts  (1                           ),
+        .Latency   (1                           ),
+        .SimInit   ("zeros"                     ),
+        .impl_in_t (impl_in_t                   )
+      ) i_meta_bank (
+        .clk_i  (clk_i                   ),
+        .rst_ni (rst_ni                  ),
+        .impl_i ('0                      ),
+        .impl_o (/* Unused */            ),
+        .req_i  (l1_tag_bank_req  [cb][j]),
+        .we_i   (l1_tag_bank_we   [cb][j]),
+        .addr_i (l1_tag_bank_addr [cb][j]),
+        .wdata_i(l1_tag_bank_wdata[cb][j]),
+        .be_i   (l1_tag_bank_be   [cb][j]),
+        .rdata_o(l1_tag_bank_rdata[cb][j])
+      );
+    end
+
+    for (genvar j = 0; j < NumDataBankPerCtrl; j++) begin : gen_l1_data_banks
+      tc_sram_impl #(
+        .NumWords  (L1CacheWayEntry/L1BankFactor),
+        .DataWidth (DataWidth),
+        .ByteWidth (DataWidth),
+        .NumPorts  (1),
+        .Latency   (1),
+        .SimInit   ("zeros")
+      ) i_data_bank (
+        .clk_i  (clk_i                    ),
+        .rst_ni (rst_ni                   ),
+        .impl_i ('0                       ),
+        .impl_o (/* Unused */             ),
+        .req_i  (l1_data_bank_req  [cb][j]),
+        .we_i   (l1_data_bank_we   [cb][j]),
+        .addr_i (l1_data_bank_addr [cb][j]),
+        .wdata_i(l1_data_bank_wdata[cb][j]),
+        .be_i   (l1_data_bank_be   [cb][j]),
+        .rdata_o(l1_data_bank_rdata[cb][j])
+      );
+
+      assign l1_data_bank_gnt[cb][j] = 1'b1;
+    end
+  end
+
+  // Hong TODO: We currently multiplex the AXI requests of all cache
+  // controllers onto one port. The mux index is prepended to the existing ID
+  // value to record which cache controller's request was picked; this can be
+  // replaced later if a more appealing mux policy comes up.
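+  // `axi_mux` prepends the $clog2(NoSlvPorts)-bit slave-port index to every
+  // outgoing transaction ID, which is why the slave-side ID width is chosen
+  // as `WideIdWidthIn - $clog2(NumL1CacheCtrl)`: the muxed IDs then fit into
+  // `WideIdWidthIn` bits again. With the default NumL1CacheCtrl = 4, two
+  // index bits are prepended.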
+ axi_mux #( + .SlvAxiIDWidth ( WideIdWidthIn-$clog2(NumL1CacheCtrl) ), // ID width of the slave ports + .slv_aw_chan_t ( axi_dcache_aw_chan_t ), // AW Channel Type, slave ports + .mst_aw_chan_t ( axi_mst_dma_aw_chan_t ), // AW Channel Type, master port + .w_chan_t ( axi_mst_dma_w_chan_t ), // W Channel Type, all ports + .slv_b_chan_t ( axi_dcache_b_chan_t ), // B Channel Type, slave ports + .mst_b_chan_t ( axi_mst_dma_b_chan_t ), // B Channel Type, master port + .slv_ar_chan_t ( axi_dcache_ar_chan_t ), // AR Channel Type, slave ports + .mst_ar_chan_t ( axi_mst_dma_ar_chan_t ), // AR Channel Type, master port + .slv_r_chan_t ( axi_dcache_r_chan_t ), // R Channel Type, slave ports + .mst_r_chan_t ( axi_mst_dma_r_chan_t ), // R Channel Type, master port + .slv_req_t ( axi_dcache_req_t ), + .slv_resp_t ( axi_dcache_resp_t ), + .mst_req_t ( axi_mst_dma_req_t ), + .mst_resp_t ( axi_mst_dma_resp_t ), + .NoSlvPorts ( NumL1CacheCtrl ) // Number of Masters for the module + ) i_dcache_axi_mux ( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .test_i ( 1'b0 ), // Test Mode enable + .slv_reqs_i ( dcache_axi_req ), + .slv_resps_o ( dcache_axi_rsp ), + .mst_req_o ( wide_axi_mst_req[DCache] ), + .mst_resp_i ( wide_axi_mst_rsp[DCache] ) + ); + + + spatz_tcdm_interconnect #( + .NumInp (NumTCDMIn ), + .NumOut (L1NumWrapper ), + .tcdm_req_t (spm_req_t ), + .tcdm_rsp_t (spm_rsp_t ), + .mem_req_t (mem_req_t ), + .mem_rsp_t (mem_rsp_t ), + .MemAddrWidth (TCDMMemAddrWidth ), + .DataWidth (DataWidth ), + .user_t (tcdm_user_t ), + .MemoryResponseLatency (1 + RegisterTCDMCuts) + ) i_tcdm_interconnect ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .req_i ({axi_soc_req, spm_req} ), + .rsp_o ({axi_soc_rsp, spm_rsp} ), + .mem_req_o (ic_req ), + .mem_rsp_i (ic_rsp ) + ); + + hive_req_t [NrCores-1:0] hive_req; + hive_rsp_t [NrCores-1:0] hive_rsp; + + for (genvar i = 0; i < NrCores; i++) begin : gen_core + localparam int unsigned TcdmPorts = get_tcdm_ports(i); + localparam int unsigned TcdmPortsOffs = get_tcdm_port_offs(i); + + axi_mst_dma_req_t axi_dma_req; + axi_mst_dma_resp_t axi_dma_res; + interrupts_t irq; + dma_events_t dma_core_events; + + sync #(.STAGES (2)) + i_sync_debug (.clk_i, .rst_ni, .serial_i (debug_req_i[i]), .serial_o (irq.debug)); + sync #(.STAGES (2)) + i_sync_meip (.clk_i, .rst_ni, .serial_i (meip_i[i]), .serial_o (irq.meip)); + sync #(.STAGES (2)) + i_sync_mtip (.clk_i, .rst_ni, .serial_i (mtip_i[i]), .serial_o (irq.mtip)); + sync #(.STAGES (2)) + i_sync_msip (.clk_i, .rst_ni, .serial_i (msip_i[i]), .serial_o (irq.msip)); + assign irq.mcip = cl_interrupt[i]; + + tcdm_req_t [TcdmPorts-1:0] tcdm_req_wo_user; + + logic [31:0] hart_id; + assign hart_id = hart_base_id_i + i; + + spatz_cc #( + .BootAddr (BootAddr ), + .L2Addr (L2Addr ), + .L2Size (L2Size ), + .RVE (1'b0 ), + .RVF (RVF ), + .RVD (RVD ), + .RVV (RVV ), + .Xdma (Xdma[i] ), + .AddrWidth (AxiAddrWidth ), + .DataWidth (NarrowDataWidth ), + .UserWidth (AxiUserWidth ), + .DMADataWidth (AxiDataWidth ), + .DMAIdWidth (AxiIdWidthIn ), + .SnitchPMACfg (SnitchPMACfg ), + .DMAAxiReqFifoDepth (DMAAxiReqFifoDepth ), + .DMAReqFifoDepth (DMAReqFifoDepth ), + .dreq_t (reqrsp_req_t ), + .drsp_t (reqrsp_rsp_t ), + .tcdm_req_t (tcdm_req_t ), + .tcdm_req_chan_t (tcdm_req_chan_t ), + .tcdm_rsp_t (tcdm_rsp_t ), + .tcdm_rsp_chan_t (tcdm_rsp_chan_t ), + .axi_req_t (axi_mst_dma_req_t ), + .axi_ar_chan_t (axi_mst_dma_ar_chan_t ), + .axi_aw_chan_t (axi_mst_dma_aw_chan_t ), + .axi_rsp_t (axi_mst_dma_resp_t ), + .hive_req_t (hive_req_t ), + .hive_rsp_t (hive_rsp_t ), + 
.acc_issue_req_t (acc_issue_req_t ), + .acc_issue_rsp_t (acc_issue_rsp_t ), + .acc_rsp_t (acc_rsp_t ), + .dma_events_t (dma_events_t ), + .dma_perf_t (axi_dma_pkg::dma_perf_t ), + .XDivSqrt (1'b0 ), + .XF16 (1'b1 ), + .XF16ALT (1'b1 ), + .XF8 (1'b1 ), + .XF8ALT (1'b1 ), + .IsoCrossing (1'b0 ), + .NumIntOutstandingLoads (NumIntOutstandingLoads[i] ), + .NumIntOutstandingMem (NumIntOutstandingMem[i] ), + .NumSpatzOutstandingLoads(NumSpatzOutstandingLoads[i]), + .FPUImplementation (FPUImplementation[i] ), + .RegisterOffloadRsp (RegisterOffloadRsp ), + .RegisterCoreReq (RegisterCoreReq ), + .RegisterCoreRsp (RegisterCoreRsp ), + .NumSpatzFPUs (NumSpatzFPUs ), + .NumSpatzIPUs (NumSpatzIPUs ), + .TCDMAddrWidth (SPMAddrWidth ) + ) i_spatz_cc ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .testmode_i (1'b0 ), + .hart_id_i (hart_id ), + .hive_req_o (hive_req[i] ), + .hive_rsp_i (hive_rsp[i] ), + .irq_i (irq ), + .data_req_o (core_req[i] ), + .data_rsp_i (core_rsp[i] ), + .tcdm_req_o (tcdm_req_wo_user ), + .tcdm_rsp_i (tcdm_rsp[TcdmPortsOffs +: TcdmPorts]), + .axi_dma_req_o (axi_dma_req ), + .axi_dma_res_i (axi_dma_res ), + .axi_dma_busy_o (/* Unused */ ), + .axi_dma_perf_o (/* Unused */ ), + .axi_dma_events_o (dma_core_events ), + .core_events_o (core_events[i] ), + .tcdm_addr_base_i (tcdm_start_address ) + ); + for (genvar j = 0; j < TcdmPorts; j++) begin : gen_tcdm_user + always_comb begin + tcdm_req[TcdmPortsOffs+j].q = tcdm_req_wo_user[j].q; + tcdm_req[TcdmPortsOffs+j].q.user.core_id = i[CoreIDWidth-1:0]; + tcdm_req[TcdmPortsOffs+j].q.user.is_core = 1; + tcdm_req[TcdmPortsOffs+j].q_valid = tcdm_req_wo_user[j].q_valid; + end + end + if (Xdma[i]) begin : gen_dma_connection + assign wide_axi_mst_req[SDMAMst] = axi_dma_req; + assign axi_dma_res = wide_axi_mst_rsp[SDMAMst]; + assign dma_events = dma_core_events; + end else begin + assign axi_dma_res = '0; + end + end + + // ---------------- + // Instruction Cache + // ---------------- + + addr_t [NrCores-1:0] inst_addr; + logic [NrCores-1:0] inst_cacheable; + logic [NrCores-1:0][31:0] inst_data; + logic [NrCores-1:0] inst_valid; + logic [NrCores-1:0] inst_ready; + logic [NrCores-1:0] inst_error; + logic [NrCores-1:0] flush_valid; + logic [NrCores-1:0] flush_ready; + + for (genvar i = 0; i < NrCores; i++) begin : gen_unpack_icache + assign inst_addr[i] = hive_req[i].inst_addr; + assign inst_cacheable[i] = hive_req[i].inst_cacheable; + assign inst_valid[i] = hive_req[i].inst_valid; + assign flush_valid[i] = hive_req[i].flush_i_valid; + assign hive_rsp[i] = '{ + inst_data : inst_data[i], + inst_ready : inst_ready[i], + inst_error : inst_error[i], + flush_i_ready: flush_ready[i], + default : '0 + }; + end + + snitch_icache #( + .NR_FETCH_PORTS ( NrCores ), + .L0_LINE_COUNT ( 8 ), + .LINE_WIDTH ( ICacheLineWidth ), + .LINE_COUNT ( ICacheLineCount ), + .SET_COUNT ( ICacheSets ), + .FETCH_AW ( AxiAddrWidth ), + .FETCH_DW ( 32 ), + .FILL_AW ( AxiAddrWidth ), + .FILL_DW ( AxiDataWidth ), + .EARLY_LATCH ( 0 ), + .L0_EARLY_TAG_WIDTH ( snitch_pkg::PAGE_SHIFT - $clog2(ICacheLineWidth/8) ), + .ISO_CROSSING ( 1'b0 ), + .axi_req_t ( axi_mst_dma_req_t ), + .axi_rsp_t ( axi_mst_dma_resp_t ), + .sram_cfg_data_t ( impl_in_t ), + .sram_cfg_tag_t ( impl_in_t ) + ) i_snitch_icache ( + .clk_i ( clk_i ), + .clk_d2_i ( clk_i ), + .rst_ni ( rst_ni ), + .enable_prefetching_i ( icache_prefetch_enable ), + .icache_events_o ( icache_events ), + .flush_valid_i ( flush_valid ), + .flush_ready_o ( flush_ready ), + .inst_addr_i ( inst_addr ), + .inst_cacheable_i ( inst_cacheable ), + 
.inst_data_o ( inst_data ), + .inst_valid_i ( inst_valid ), + .inst_ready_o ( inst_ready ), + .inst_error_o ( inst_error ), + .sram_cfg_tag_i ( '0 ), + .sram_cfg_data_i ( '0 ), + .axi_req_o ( wide_axi_mst_req[ICache] ), + .axi_rsp_i ( wide_axi_mst_rsp[ICache] ) + ); + + // -------- + // Cores SoC + // -------- + spatz_barrier #( + .AddrWidth (AxiAddrWidth ), + .NrPorts (NrCores ), + .dreq_t (reqrsp_req_t ), + .drsp_t (reqrsp_rsp_t ) + ) i_snitch_barrier ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .in_req_i (core_req ), + .in_rsp_o (core_rsp ), + .out_req_o (filtered_core_req ), + .out_rsp_i (filtered_core_rsp ), + .cluster_periph_start_address_i (cluster_periph_start_address) + ); + + reqrsp_req_t core_to_axi_req; + reqrsp_rsp_t core_to_axi_rsp; + user_t cluster_user; + // Atomic ID, needs to be unique ID of cluster + // cluster_id + HartIdOffset + 1 (because 0 is for non-atomic masters) + assign cluster_user = (hart_base_id_i / NrCores) + (hart_base_id_i % NrCores) + 1'b1; + + reqrsp_mux #( + .NrPorts (NrCores ), + .AddrWidth (AxiAddrWidth ), + .DataWidth (NarrowDataWidth ), + .req_t (reqrsp_req_t ), + .rsp_t (reqrsp_rsp_t ), + .RespDepth (2 ) + ) i_reqrsp_mux_core ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .slv_req_i (filtered_core_req), + .slv_rsp_o (filtered_core_rsp), + .mst_req_o (core_to_axi_req ), + .mst_rsp_i (core_to_axi_rsp ), + .idx_o (/*unused*/ ) + ); + + reqrsp_to_axi #( + .DataWidth (NarrowDataWidth), + .UserWidth (NarrowUserWidth), + .reqrsp_req_t (reqrsp_req_t ), + .reqrsp_rsp_t (reqrsp_rsp_t ), + .axi_req_t (axi_mst_req_t ), + .axi_rsp_t (axi_mst_resp_t ) + ) i_reqrsp_to_axi_core ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .user_i (cluster_user ), + .reqrsp_req_i (core_to_axi_req ), + .reqrsp_rsp_o (core_to_axi_rsp ), + .axi_req_o (narrow_axi_mst_req[CoreReq]), + .axi_rsp_i (narrow_axi_mst_rsp[CoreReq]) + ); + + xbar_rule_t [NrNarrowRules-1:0] cluster_xbar_rules; + + assign cluster_xbar_rules = '{ + '{ + idx : TCDM, + start_addr: tcdm_start_address, + end_addr : tcdm_end_address + }, + '{ + idx : ClusterPeripherals, + start_addr: cluster_periph_start_address, + end_addr : cluster_periph_end_address + } + }; + + localparam bit [ClusterXbarCfg.NoSlvPorts-1:0] ClusterEnableDefaultMstPort = '1; + localparam logic [ClusterXbarCfg.NoSlvPorts-1:0][cf_math_pkg::idx_width(ClusterXbarCfg.NoMstPorts)-1:0] ClusterXbarDefaultPort = '{default: SoC}; + + axi_xbar #( + .Cfg (ClusterXbarCfg ), + .slv_aw_chan_t (axi_mst_aw_chan_t), + .mst_aw_chan_t (axi_slv_aw_chan_t), + .w_chan_t (axi_mst_w_chan_t ), + .slv_b_chan_t (axi_mst_b_chan_t ), + .mst_b_chan_t (axi_slv_b_chan_t ), + .slv_ar_chan_t (axi_mst_ar_chan_t), + .mst_ar_chan_t (axi_slv_ar_chan_t), + .slv_r_chan_t (axi_mst_r_chan_t ), + .mst_r_chan_t (axi_slv_r_chan_t ), + .slv_req_t (axi_mst_req_t ), + .slv_resp_t (axi_mst_resp_t ), + .mst_req_t (axi_slv_req_t ), + .mst_resp_t (axi_slv_resp_t ), + .rule_t (xbar_rule_t ) + ) i_cluster_xbar ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .test_i (1'b0 ), + .slv_ports_req_i (narrow_axi_mst_req ), + .slv_ports_resp_o (narrow_axi_mst_rsp ), + .mst_ports_req_o (narrow_axi_slv_req ), + .mst_ports_resp_i (narrow_axi_slv_rsp ), + .addr_map_i (cluster_xbar_rules ), + .en_default_mst_port_i (ClusterEnableDefaultMstPort), + .default_mst_port_i (ClusterXbarDefaultPort ) + ); + + // --------- + // Slaves + // --------- + // 1. TCDM + // Add an adapter that allows access from AXI to the TCDM. 
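+  // Roughly: `axi_to_tcdm` terminates AXI transactions from the narrow
+  // crossbar and replays them as single-beat TCDM requests; `BufDepth` is
+  // sized to the memory round-trip latency (MemoryMacroLatency + 1) so every
+  // issued request has a guaranteed response slot.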
+ axi_to_tcdm #( + .axi_req_t (axi_slv_req_t ), + .axi_rsp_t (axi_slv_resp_t ), + .tcdm_req_t (spm_req_t ), + .tcdm_rsp_t (spm_rsp_t ), + .AddrWidth (AxiAddrWidth ), + .DataWidth (NarrowDataWidth ), + .IdWidth (NarrowIdWidthOut ), + .BufDepth (MemoryMacroLatency + 1) + ) i_axi_to_tcdm ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .axi_req_i (narrow_axi_slv_req[TCDM]), + .axi_rsp_o (narrow_axi_slv_rsp[TCDM]), + .tcdm_req_o (axi_soc_req ), + .tcdm_rsp_i (axi_soc_rsp ) + ); + + // 2. Peripherals + axi_to_reg #( + .ADDR_WIDTH (AxiAddrWidth ), + .DATA_WIDTH (NarrowDataWidth ), + .AXI_MAX_WRITE_TXNS (1 ), + .AXI_MAX_READ_TXNS (1 ), + .DECOUPLE_W (0 ), + .ID_WIDTH (NarrowIdWidthOut ), + .USER_WIDTH (NarrowUserWidth ), + .axi_req_t (axi_slv_req_t ), + .axi_rsp_t (axi_slv_resp_t ), + .reg_req_t (reg_req_t ), + .reg_rsp_t (reg_rsp_t ) + ) i_axi_to_reg ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .testmode_i (1'b0 ), + .axi_req_i (narrow_axi_slv_req[ClusterPeripherals]), + .axi_rsp_o (narrow_axi_slv_rsp[ClusterPeripherals]), + .reg_req_o (reg_req ), + .reg_rsp_i (reg_rsp ) + ); + + spatz_cluster_peripheral #( + .AddrWidth (AxiAddrWidth ), + .SPMWidth ($clog2(L1NumSet)), + .reg_req_t (reg_req_t ), + .reg_rsp_t (reg_rsp_t ), + .tcdm_events_t (tcdm_events_t ), + .dma_events_t (dma_events_t ), + .NrCores (NrCores ) + ) i_snitch_cluster_peripheral ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .reg_req_i (reg_req ), + .reg_rsp_o (reg_rsp ), + /// The TCDM always starts at the cluster base. + .tcdm_start_address_i (tcdm_start_address ), + .tcdm_end_address_i (tcdm_end_address ), + .icache_prefetch_enable_o (icache_prefetch_enable), + .cl_clint_o (cl_interrupt ), + .cluster_hart_base_id_i (hart_base_id_i ), + .core_events_i (core_events ), + .tcdm_events_i (tcdm_events ), + .dma_events_i (dma_events ), + .icache_events_i (icache_events ), + .cluster_probe_o (cluster_probe_o ), + .dynamic_offset_o (dynamic_offset ), + .l1d_spm_size_o (cfg_spm_size ), + .l1d_insn_o (l1d_insn ), + .l1d_insn_valid_o (l1d_insn_valid ), + // TODO: Here we only check controller 0 + .l1d_insn_ready_i (l1d_insn_ready[0] ), + .l1d_busy_o (l1d_busy ) + ); + + // 3. 
BootROM
+  axi_to_reg #(
+    .ADDR_WIDTH         (AxiAddrWidth      ),
+    .DATA_WIDTH         (AxiDataWidth      ),
+    .AXI_MAX_WRITE_TXNS (1                 ),
+    .AXI_MAX_READ_TXNS  (1                 ),
+    .DECOUPLE_W         (0                 ),
+    .ID_WIDTH           (WideIdWidthOut    ),
+    .USER_WIDTH         (AxiUserWidth      ),
+    .axi_req_t          (axi_slv_dma_req_t ),
+    .axi_rsp_t          (axi_slv_dma_resp_t),
+    .reg_req_t          (reg_dma_req_t     ),
+    .reg_rsp_t          (reg_dma_rsp_t     )
+  ) i_axi_to_reg_bootrom (
+    .clk_i      (clk_i                    ),
+    .rst_ni     (rst_ni                   ),
+    .testmode_i (1'b0                     ),
+    .axi_req_i  (wide_axi_slv_req[BootROM]),
+    .axi_rsp_o  (wide_axi_slv_rsp[BootROM]),
+    .reg_req_o  (bootrom_reg_req          ),
+    .reg_rsp_i  (bootrom_reg_rsp          )
+  );
+
+  bootrom i_bootrom (
+    .clk_i  (clk_i                         ),
+    .req_i  (bootrom_reg_req.valid         ),
+    .addr_i (addr_t'(bootrom_reg_req.addr) ),
+    .rdata_o(bootrom_reg_rsp.rdata         )
+  );
+  `FF(bootrom_reg_rsp.ready, bootrom_reg_req.valid, 1'b0)
+  assign bootrom_reg_rsp.error = 1'b0;
+
+  // Upsize the narrow SoC connection
+  `AXI_TYPEDEF_ALL(axi_mst_dma_narrow, addr_t, id_dma_mst_t, data_t, strb_t, user_t)
+  axi_mst_dma_narrow_req_t  narrow_axi_slv_req_soc;
+  axi_mst_dma_narrow_resp_t narrow_axi_slv_resp_soc;
+
+  axi_iw_converter #(
+    .AxiAddrWidth          (AxiAddrWidth             ),
+    .AxiDataWidth          (NarrowDataWidth          ),
+    .AxiUserWidth          (AxiUserWidth             ),
+    .AxiSlvPortIdWidth     (NarrowIdWidthOut         ),
+    .AxiSlvPortMaxUniqIds  (1                        ),
+    .AxiSlvPortMaxTxnsPerId(1                        ),
+    .AxiSlvPortMaxTxns     (1                        ),
+    .AxiMstPortIdWidth     (WideIdWidthIn            ),
+    .AxiMstPortMaxUniqIds  (1                        ),
+    .AxiMstPortMaxTxnsPerId(1                        ),
+    .slv_req_t             (axi_slv_req_t            ),
+    .slv_resp_t            (axi_slv_resp_t           ),
+    .mst_req_t             (axi_mst_dma_narrow_req_t ),
+    .mst_resp_t            (axi_mst_dma_narrow_resp_t)
+  ) i_soc_port_iw_convert (
+    .clk_i      (clk_i                  ),
+    .rst_ni     (rst_ni                 ),
+    .slv_req_i  (narrow_axi_slv_req[SoC]),
+    .slv_resp_o (narrow_axi_slv_rsp[SoC]),
+    .mst_req_o  (narrow_axi_slv_req_soc ),
+    .mst_resp_i (narrow_axi_slv_resp_soc)
+  );
+
+  axi_dw_converter #(
+    .AxiAddrWidth       (AxiAddrWidth               ),
+    .AxiIdWidth         (WideIdWidthIn              ),
+    .AxiMaxReads        (2                          ),
+    .AxiSlvPortDataWidth(NarrowDataWidth            ),
+    .AxiMstPortDataWidth(AxiDataWidth               ),
+    .ar_chan_t          (axi_mst_dma_ar_chan_t      ),
+    .aw_chan_t          (axi_mst_dma_aw_chan_t      ),
+    .b_chan_t           (axi_mst_dma_b_chan_t       ),
+    .slv_r_chan_t       (axi_mst_dma_narrow_r_chan_t),
+    .slv_w_chan_t       (axi_mst_dma_narrow_w_chan_t),
+    .axi_slv_req_t      (axi_mst_dma_narrow_req_t   ),
+    .axi_slv_resp_t     (axi_mst_dma_narrow_resp_t  ),
+    .mst_r_chan_t       (axi_mst_dma_r_chan_t       ),
+    .mst_w_chan_t       (axi_mst_dma_w_chan_t       ),
+    .axi_mst_req_t      (axi_mst_dma_req_t          ),
+    .axi_mst_resp_t     (axi_mst_dma_resp_t         )
+  ) i_soc_port_dw_upsize (
+    .clk_i      (clk_i                        ),
+    .rst_ni     (rst_ni                       ),
+    .slv_req_i  (narrow_axi_slv_req_soc       ),
+    .slv_resp_o (narrow_axi_slv_resp_soc      ),
+    .mst_req_o  (wide_axi_mst_req[CoreReqWide]),
+    .mst_resp_i (wide_axi_mst_rsp[CoreReqWide])
+  );
+
+  // --------------------
+  // TCDM event counters
+  // --------------------
+  logic [NrTCDMPortsCores-1:0] flat_acc, flat_con;
+  for (genvar i = 0; i < NrTCDMPortsCores; i++) begin : gen_event_counter
+    `FFARN(flat_acc[i], tcdm_req[i].q_valid, '0, clk_i, rst_ni)
+    `FFARN(flat_con[i], tcdm_req[i].q_valid & ~tcdm_rsp[i].q_ready, '0, clk_i, rst_ni)
+  end
+
+  popcount #(
+    .INPUT_WIDTH ( NrTCDMPortsCores )
+  ) i_popcount_req (
+    .data_i     ( flat_acc                 ),
+    .popcount_o ( tcdm_events.inc_accessed )
+  );
+
+  popcount #(
+    .INPUT_WIDTH ( NrTCDMPortsCores )
+  ) i_popcount_con (
+    .data_i     ( flat_con                  ),
+    .popcount_o ( tcdm_events.inc_congested )
+  );
+
+  // -------------
+  // Sanity Checks
+  // -------------
+  // Sanity check the parameters. Not every configuration makes sense.
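+  // For a power-of-two TCDMSize, `((TCDMSize - 1) & cluster_base_addr_i) == 0`
+  // holds exactly when the base address is TCDMSize-aligned: e.g., for a
+  // TCDMSize of 32'h4_0000 (256 KiB), the low 18 address bits must be zero.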
+ `ASSERT_INIT(CheckSuperBankSanity, NrBanks >= BanksPerSuperBank); + `ASSERT_INIT(CheckSuperBankFactor, (NrBanks % BanksPerSuperBank) == 0); + // Check that the cluster base address aligns to the TCDMSize. + `ASSERT(ClusterBaseAddrAlign, ((TCDMSize - 1) & cluster_base_addr_i) == 0) + // Make sure we only have one DMA in the system. + `ASSERT_INIT(NumberDMA, $onehot0(Xdma)) + +endmodule diff --git a/hardware/src/cachepool_pkg.sv b/hardware/src/cachepool_pkg.sv new file mode 100644 index 0000000..d147e35 --- /dev/null +++ b/hardware/src/cachepool_pkg.sv @@ -0,0 +1,193 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +package cachepool_pkg; + import fpnew_pkg::*; + + /********************* + * TILE PARAMETERS * + *********************/ + + `include "axi/assign.svh" + `include "axi/typedef.svh" + + localparam int unsigned NumTiles = 1; + + /////////// + // AXI // + /////////// + + // AXI Data Width + localparam int unsigned SpatzAxiDataWidth = 256; + localparam int unsigned SpatzAxiStrbWidth = SpatzAxiDataWidth / 8; + localparam int unsigned SpatzAxiNarrowDataWidth = 64; + // AXI Address Width + localparam int unsigned SpatzAxiAddrWidth = 32; + // AXI ID Width + localparam int unsigned SpatzAxiIdInWidth = 6; + localparam int unsigned SpatzAxiIdOutWidth = 2; + + // FIXED AxiIdOutWidth + // Add 3 because of cache controller (second-level xbar, 4 cache, 1 old port) + localparam int unsigned IwcAxiIdOutWidth = 3 + $clog2(4) + 3; + + // AXI User Width + localparam int unsigned SpatzAxiUserWidth = 10; + + + typedef logic [SpatzAxiDataWidth-1:0] axi_data_t; + typedef logic [SpatzAxiStrbWidth-1:0] axi_strb_t; + typedef logic [SpatzAxiAddrWidth-1:0] axi_addr_t; + typedef logic [SpatzAxiIdInWidth-1:0] axi_id_in_t; + typedef logic [SpatzAxiIdOutWidth-1:0] axi_id_out_t; + typedef logic [SpatzAxiUserWidth-1:0] axi_user_t; + + + `AXI_TYPEDEF_ALL(spatz_axi_in, axi_addr_t, axi_id_in_t, logic [63:0], logic [7:0], axi_user_t) + `AXI_TYPEDEF_ALL(spatz_axi_out, axi_addr_t, axi_id_out_t, axi_data_t, axi_strb_t, axi_user_t) + + typedef logic [IwcAxiIdOutWidth-1:0] axi_id_out_iwc_t; + + `AXI_TYPEDEF_ALL(spatz_axi_iwc_out, axi_addr_t, axi_id_out_iwc_t, axi_data_t, axi_strb_t, axi_user_t) + + //////////////////// + // Spatz Cluster // + //////////////////// + + localparam int unsigned NumCores = 4; + // TODO: read from CFG + localparam int unsigned NumBank = 16; + localparam int unsigned TCDMDepth = 2048; + // localparam int unsigned TCDMDepth = 4096; + + localparam int unsigned SpatzDataWidth = 64; + localparam int unsigned BeWidth = SpatzDataWidth / 8; + localparam int unsigned ByteOffset = $clog2(BeWidth); + + localparam int unsigned ICacheLineWidth = 128; + localparam int unsigned ICacheLineCount = 128; + localparam int unsigned ICacheSets = 2; + + localparam int unsigned TCDMStartAddr = 32'h5100_0000; + localparam int unsigned TCDMSize = 32'h4_0000; + + localparam int unsigned PeriStartAddr = TCDMStartAddr + TCDMSize; + + localparam int unsigned BootAddr = 32'h1000; + + // L2 Configuration + localparam int unsigned L2Addr = 48'h5180_0000; + localparam int unsigned L2Size = 48'h0080_0000; + + function automatic snitch_pma_pkg::rule_t [snitch_pma_pkg::NrMaxRules-1:0] get_cached_regions(); + automatic snitch_pma_pkg::rule_t [snitch_pma_pkg::NrMaxRules-1:0] cached_regions; + cached_regions = '{default: '0}; + cached_regions[0] = '{base: 32'h80000000, mask: 32'h80000000}; + cached_regions[1] = 
'{base: 32'h51800000, mask: 32'hff800000};
+    return cached_regions;
+  endfunction
+
+  localparam snitch_pma_pkg::snitch_pma_t SnitchPMACfg = '{
+    NrCachedRegionRules: 2,
+    CachedRegion: get_cached_regions(),
+    default: 0
+  };
+
+  /////////////////
+  // Spatz Core //
+  /////////////////
+
+  localparam int unsigned NFpu = 4;
+  localparam int unsigned NIpu = 4;
+
+
+  localparam fpu_implementation_t FPUImplementation_Core = '{
+    // FMA Block
+    PipeRegs: '{
+      // FP32 FP64 FP16 FP8 FP16A FP8A
+      '{  1,   2,   1,   0,   1,    0}, // ADDMUL
+      '{  1,   1,   1,   1,   1,    1}, // DIVSQRT
+      '{  1,   1,   1,   1,   1,    1}, // NONCOMP
+      '{  2,   2,   2,   2,   2,    2}, // CONV
+      '{  4,   4,   4,   4,   4,    4}  // DOTP
+    },
+    UnitTypes: '{
+      '{ MERGED,   MERGED,   MERGED,   MERGED,   MERGED,   MERGED   }, // FMA
+      '{ DISABLED, DISABLED, DISABLED, DISABLED, DISABLED, DISABLED }, // DIVSQRT
+      '{ PARALLEL, PARALLEL, PARALLEL, PARALLEL, PARALLEL, PARALLEL }, // NONCOMP
+      '{ MERGED,   MERGED,   MERGED,   MERGED,   MERGED,   MERGED   }, // CONV
+      '{ MERGED,   MERGED,   MERGED,   MERGED,   MERGED,   MERGED   }  // DOTP
+    },
+    PipeConfig: BEFORE
+  };
+
+  localparam fpu_implementation_t FPUImplementation [NumCores] = '{default: FPUImplementation_Core};
+
+  ////////////////////
+  //  CachePool L1  //
+  ////////////////////
+
+  // Address width of cache
+  localparam int unsigned L1AddrWidth = 32;
+  // Cache line width
+  localparam int unsigned L1LineWidth = SpatzAxiDataWidth;
+  // Coalescer window
+  localparam int unsigned L1CoalFactor = 2;
+  // Total number of data banks
+  localparam int unsigned L1NumDataBank = 128;
+  // Number of bank wrappers the SPM can see
+  localparam int unsigned L1NumWrapper = NumBank;
+  // SPM view: number of banks in each bank wrapper (used to mitigate the routing complexity of this many banks)
+  localparam int unsigned L1BankPerWP = L1NumDataBank / NumBank;
+  // Pseudo dual bank
+  localparam int unsigned L1BankFactor = 2;
+  // Cache ways (total way number across multiple cache controllers)
+  localparam int unsigned L1Associativity = L1NumDataBank / (L1LineWidth / SpatzDataWidth) / L1BankFactor;
+  // Number of entries of the L1 cache (total number across multiple cache controllers)
+  localparam int unsigned L1NumEntry = NumBank * TCDMDepth * SpatzDataWidth / L1LineWidth;
+  // Number of cache entries each cache way has
+  localparam int unsigned L1CacheWayEntry = L1NumEntry / L1Associativity;
+  // Number of cache sets each cache way has
+  localparam int unsigned L1NumSet = L1CacheWayEntry / L1BankFactor;
+  // Number of tag banks
+  localparam int unsigned L1NumTagBank = L1BankFactor * L1Associativity;
+  // Number of lines per bank unit
+  localparam int unsigned DepthPerBank = TCDMDepth / L1BankPerWP;
+  // Cache total size in KiB
+  localparam int unsigned L1Size = NumBank * TCDMDepth * BeWidth / 1024;
+
+  // Number of cache controllers (currently fixed to NumCores; if this changes,
+  // the controller AXI output ID width must change too)
+  localparam int unsigned NumL1CacheCtrl = NumCores;
+  // Number of data banks assigned to each cache controller
+  localparam int unsigned NumDataBankPerCtrl = L1NumDataBank / NumL1CacheCtrl;
+  // Number of tag banks assigned to each cache controller
+  localparam int unsigned NumTagBankPerCtrl = L1NumTagBank / NumL1CacheCtrl;
+  // Number of ways per cache controller
+  localparam int unsigned L1AssoPerCtrl = L1Associativity / NumL1CacheCtrl;
+  // Number of entries per cache controller
+  localparam int unsigned L1NumEntryPerCtrl = L1NumEntry / NumL1CacheCtrl;
+
+  // Do we need to keep DMA here?
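+
+  // Putting the default values together (NumBank = 16, TCDMDepth = 2048,
+  // SpatzDataWidth = 64, L1LineWidth = 256, L1BankFactor = 2, NumCores = 4):
+  //   L1NumEntry      = 16 * 2048 * 64 / 256 = 8192 cache lines
+  //   L1Associativity = 128 / (256 / 64) / 2 = 16 ways
+  //   L1CacheWayEntry = 8192 / 16            = 512 lines per way
+  //   L1NumSet        = 512 / 2              = 256 sets
+  //   L1Size          = 16 * 2048 * 8 / 1024 = 256 KiB
+  // Per controller (NumL1CacheCtrl = 4) this yields 32 data banks, 8 tag
+  // banks, 4 ways, and 2048 cache entries.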
+ localparam int unsigned NumTileWideAxi = 2; + typedef enum integer { + TileBootROM = 0, + TileMem = 1 + } tile_wide_e; + + localparam int unsigned NumTileNarrowAxi = 1; + typedef enum integer { + TilePeriph = 0 + } tile_narrow_e; + + // TODO: multi-tile support + localparam int unsigned NumClusterAxiMst = 1 + NumL1CacheCtrl; + localparam int unsigned NumClusterAxiSlv = 2; + + typedef enum integer { + ClusterL2 = 0, + ClusterL3 = 1 + } cluster_slv_e; + +endpackage : cachepool_pkg diff --git a/hardware/src/cachepool_tile.sv b/hardware/src/cachepool_tile.sv new file mode 100644 index 0000000..744b807 --- /dev/null +++ b/hardware/src/cachepool_tile.sv @@ -0,0 +1,1527 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Author: Diyou Shen + +`include "axi/assign.svh" +`include "axi/typedef.svh" +`include "common_cells/assertions.svh" +`include "common_cells/registers.svh" +`include "mem_interface/assign.svh" +`include "mem_interface/typedef.svh" +`include "register_interface//assign.svh" +`include "register_interface/typedef.svh" +`include "reqrsp_interface/assign.svh" +`include "reqrsp_interface/typedef.svh" +`include "snitch_vm/typedef.svh" +`include "tcdm_interface/assign.svh" +`include "tcdm_interface/typedef.svh" + +/// Tile implementation for CachePool +module cachepool_tile + import cachepool_pkg::*; + import spatz_pkg::*; + import fpnew_pkg::fpu_implementation_t; + import snitch_pma_pkg::snitch_pma_t; + #( + /// Width of physical address. + parameter int unsigned AxiAddrWidth = 48, + /// Width of AXI port. + parameter int unsigned AxiDataWidth = 512, + /// AXI: id width in. + parameter int unsigned AxiIdWidthIn = 2, + /// AXI: id width out. + parameter int unsigned AxiIdWidthOut = 2, + /// AXI: user width. + parameter int unsigned AxiUserWidth = 1, + /// Address from which to fetch the first instructions. + parameter logic [31:0] BootAddr = 32'h0, + /// Address to indicate start of L2 + parameter logic [AxiAddrWidth-1:0] L2Addr = 48'h0, + parameter logic [AxiAddrWidth-1:0] L2Size = 48'h0, + /// The total amount of cores. + parameter int unsigned NrCores = 8, + /// Data/TCDM memory depth per cut (in words). + parameter int unsigned TCDMDepth = 1024, + /// Cluster peripheral address region size (in kB). + parameter int unsigned ClusterPeriphSize = 64, + /// Number of TCDM Banks. + parameter int unsigned NrBanks = 2 * NrCores, + /// Size of DMA AXI buffer. + parameter int unsigned DMAAxiReqFifoDepth = 3, + /// Size of DMA request fifo. + parameter int unsigned DMAReqFifoDepth = 3, + /// Width of a single icache line. + parameter unsigned ICacheLineWidth = 0, + /// Number of icache lines per set. + parameter int unsigned ICacheLineCount = 0, + /// Number of icache sets. + parameter int unsigned ICacheSets = 0, + // PMA Configuration + parameter snitch_pma_t SnitchPMACfg = '{default: 0}, + /// # Core-global parameters + /// FPU configuration. + parameter fpu_implementation_t FPUImplementation [NrCores] = '{default: fpu_implementation_t'(0)}, + /// Spatz FPU/IPU Configuration + parameter int unsigned NumSpatzFPUs = 4, + parameter int unsigned NumSpatzIPUs = 1, + /// Per-core enabling of the custom `Xdma` ISA extensions. 
parameter bit [NrCores-1:0] Xdma = '{default: '0},
+    /// # Per-core parameters
+    /// Per-core integer outstanding loads
+    parameter int unsigned NumIntOutstandingLoads [NrCores] = '{default: '0},
+    /// Per-core integer outstanding memory operations (loads and stores)
+    parameter int unsigned NumIntOutstandingMem [NrCores] = '{default: '0},
+    /// Per-core Spatz outstanding loads
+    parameter int unsigned NumSpatzOutstandingLoads [NrCores] = '{default: '0},
+    /// ## Timing Tuning Parameters
+    /// Insert pipeline registers into off-loading path (response)
+    parameter bit RegisterOffloadRsp = 1'b0,
+    /// Insert pipeline registers into data memory path (request)
+    parameter bit RegisterCoreReq = 1'b0,
+    /// Insert pipeline registers into data memory path (response)
+    parameter bit RegisterCoreRsp = 1'b0,
+    /// Insert pipeline registers after each memory cut
+    parameter bit RegisterTCDMCuts = 1'b0,
+    /// Decouple external AXI plug
+    parameter bit RegisterExt = 1'b0,
+    parameter axi_pkg::xbar_latency_e XbarLatency = axi_pkg::CUT_ALL_PORTS,
+    /// Outstanding transactions on the AXI network
+    parameter int unsigned MaxMstTrans = 4,
+    parameter int unsigned MaxSlvTrans = 4,
+    /// # Interface
+    /// AXI Ports
+    parameter type axi_in_req_t   = logic,
+    parameter type axi_in_resp_t  = logic,
+    parameter type axi_out_req_t  = logic,
+    parameter type axi_out_resp_t = logic,
+    /// SRAM configuration
+    parameter type impl_in_t = logic,
+    // Memory latency parameter. Most of the memories have a read latency of 1. In
+    // case you have memory macros which are pipelined you want to adjust this
+    // value here. This only applies to the TCDM. The instruction cache macros will break!
+    // In case you are using the `RegisterTCDMCuts` feature this adds an
+    // additional cycle latency, which is taken into account here.
+    parameter int unsigned MemoryMacroLatency = 1 + RegisterTCDMCuts,
+    /// # SRAM Configuration rules needed: L1D Tag + L1D Data + L1D FIFO + L1I Tag + L1I Data
+    /*** ATTENTION: `NrSramCfg` should be changed if `L1NumDataBank` and `L1NumTagBank` are changed ***/
+    parameter int unsigned NrSramCfg = 1
+  ) (
+    /// System clock.
+    input  logic clk_i,
+    /// Asynchronous active high reset. This signal is assumed to be _async_.
+    input  logic rst_ni,
+    /// Per-core debug request signal. Asserting this signal puts the
+    /// corresponding core into debug mode. This signal is assumed to be _async_.
+    input  logic [NrCores-1:0] debug_req_i,
+    /// Machine external interrupt pending. Usually those interrupts come from a
+    /// platform-level interrupt controller. This signal is assumed to be _async_.
+    input  logic [NrCores-1:0] meip_i,
+    /// Machine timer interrupt pending. Usually those interrupts come from a
+    /// core-local interrupt controller such as a timer/RTC. This signal is
+    /// assumed to be _async_.
+    input  logic [NrCores-1:0] mtip_i,
+    /// Core software interrupt pending. Usually those interrupts come from
+    /// another core to facilitate inter-processor-interrupts. This signal is
+    /// assumed to be _async_.
+    input  logic [NrCores-1:0] msip_i,
+    /// First hartid of the cluster. Cores of a cluster are monotonically
+    /// increasing without a gap, i.e., a cluster with 8 cores and a
+    /// `hart_base_id_i` of 5 gets the hart IDs 5 to 12.
+    input  logic [9:0] hart_base_id_i,
+    /// Base address of cluster. TCDM and cluster peripheral location are derived from
+    /// it. This signal is pseudo-static.
+    input  logic [AxiAddrWidth-1:0] cluster_base_addr_i,
+    /// Per-cluster probe on the cluster status.
Can be written by the cores to indicate + /// to the overall system that the cluster is executing something. + output logic tile_probe_o, + /// AXI Core cluster in-port. + input axi_in_req_t axi_in_req_i, + output axi_in_resp_t axi_in_resp_o, + /// AXI Cache Refill ports + output axi_out_req_t [NumL1CacheCtrl-1:0] axi_cache_req_o, + input axi_out_resp_t [NumL1CacheCtrl-1:0] axi_cache_rsp_i, + /// Wide AXI ports to cluster level + output axi_out_req_t [NumTileWideAxi-1:0] axi_wide_req_o, + input axi_out_resp_t [NumTileWideAxi-1:0] axi_wide_rsp_i, + // /// Narrow AXI ports to cluster level (peripheral) + // output axi_in_req_t [NumNarrowAxi-1:0] axi_narrow_req_o, + // input axi_in_resp_t [NumNarrowAxi-1:0] axi_narrow_rsp_i, + /// SRAM Configuration Ports, usually not used. + input impl_in_t [NrSramCfg-1:0] impl_i, + /// Indicate the program execution is error + output logic error_o + ); + // --------- + // Imports + // --------- + import snitch_pkg::*; + import snitch_icache_pkg::icache_events_t; + + // --------- + // Constants + // --------- + /// Minimum width to hold the core number. + localparam int unsigned CoreIDWidth = cf_math_pkg::idx_width(NrCores); + localparam int unsigned TCDMMemAddrWidth = $clog2(TCDMDepth); + localparam int unsigned TCDMSize = NrBanks * TCDMDepth * BeWidth; + // The short address for SPM + localparam int unsigned SPMAddrWidth = $clog2(TCDMSize); + // Enlarge the address width for Spatz due to cache + localparam int unsigned TCDMAddrWidth = 32; + localparam int unsigned BanksPerSuperBank = AxiDataWidth / DataWidth; + localparam int unsigned NrSuperBanks = NrBanks / BanksPerSuperBank; + + function automatic int unsigned get_tcdm_ports(int unsigned core); + return spatz_pkg::N_FU + 1; + endfunction + + function automatic int unsigned get_tcdm_port_offs(int unsigned core_idx); + automatic int n = 0; + for (int i = 0; i < core_idx; i++) n += get_tcdm_ports(i); + return n; + endfunction + + localparam int unsigned NrTCDMPortsPerCore = get_tcdm_ports(0); + localparam int unsigned NrTCDMPortsCores = get_tcdm_port_offs(NrCores); + localparam int unsigned NumTCDMIn = NrTCDMPortsCores + 1; + localparam logic [AxiAddrWidth-1:0] TCDMMask = ~(TCDMSize-1); + + // Core Request, SoC Request + localparam int unsigned NrNarrowMasters = 2; + + // Narrow AXI network parameters + localparam int unsigned NarrowIdWidthIn = AxiIdWidthIn; + localparam int unsigned NarrowIdWidthOut = NarrowIdWidthIn + $clog2(NrNarrowMasters); + localparam int unsigned NarrowDataWidth = ELEN; + localparam int unsigned NarrowUserWidth = AxiUserWidth; + + // TCDM, Peripherals, SoC Request + localparam int unsigned NrNarrowSlaves = 3; + localparam int unsigned NrNarrowRules = NrNarrowSlaves - 1; + + // Core Request, DMA, Instruction cache + localparam int unsigned NrWideMasters = 3; + // localparam int unsigned WideIdWidthOut = AxiIdWidthOut; + // localparam int unsigned WideIdWidthIn = WideIdWidthOut - $clog2(NrWideMasters); + localparam int unsigned WideIdWidthOut = 5; + localparam int unsigned WideIdWidthIn = 3; + // DMA X-BAR configuration + localparam int unsigned NrWideSlaves = 3; + + // AXI Configuration + localparam axi_pkg::xbar_cfg_t ClusterXbarCfg = '{ + NoSlvPorts : NrNarrowMasters, + NoMstPorts : NrNarrowSlaves, + MaxMstTrans : MaxMstTrans, + MaxSlvTrans : MaxSlvTrans, + FallThrough : 1'b0, + LatencyMode : XbarLatency, + AxiIdWidthSlvPorts: NarrowIdWidthIn, + AxiIdUsedSlvPorts : NarrowIdWidthIn, + UniqueIds : 1'b0, + AxiAddrWidth : AxiAddrWidth, + AxiDataWidth : NarrowDataWidth, + NoAddrRules : 
NrNarrowRules, + default : '0 + }; + + // DMA configuration struct + localparam axi_pkg::xbar_cfg_t DmaXbarCfg = '{ + NoSlvPorts : NrWideMasters, + NoMstPorts : NrWideSlaves, + MaxMstTrans : MaxMstTrans, + MaxSlvTrans : MaxSlvTrans, + FallThrough : 1'b0, + LatencyMode : XbarLatency, + AxiIdWidthSlvPorts: WideIdWidthIn, + AxiIdUsedSlvPorts : WideIdWidthIn, + UniqueIds : 1'b0, + AxiAddrWidth : AxiAddrWidth, + AxiDataWidth : AxiDataWidth, + NoAddrRules : NrWideSlaves - 1, + default : '0 + }; + + // -------- + // Typedefs + // -------- + typedef logic [AxiAddrWidth-1:0] addr_t; + typedef logic [NarrowDataWidth-1:0] data_t; + typedef logic [63:0] tag_data_t; + typedef logic [NarrowDataWidth/8-1:0] strb_t; + typedef logic [AxiDataWidth-1:0] data_dma_t; + typedef logic [AxiDataWidth/8-1:0] strb_dma_t; + typedef logic [NarrowIdWidthIn-1:0] id_mst_t; + typedef logic [NarrowIdWidthOut-1:0] id_slv_t; + typedef logic [WideIdWidthIn-1:0] id_dma_mst_t; + typedef logic [WideIdWidthOut-1:0] id_dma_slv_t; + typedef logic [WideIdWidthIn-$clog2(NumL1CacheCtrl)-1:0] id_dcache_mst_t; + typedef logic [NarrowUserWidth-1:0] user_t; + typedef logic [AxiUserWidth-1:0] user_dma_t; + + typedef logic [TCDMMemAddrWidth-1:0] tcdm_mem_addr_t; + typedef logic [TCDMAddrWidth-1:0] tcdm_addr_t; + typedef logic [SPMAddrWidth-1:0] spm_addr_t; + + typedef logic [$clog2(NumSpatzOutstandingLoads[0])-1:0] reqid_t; + + typedef logic [$clog2(L1NumSet)-1:0] tcdm_bank_addr_t; + + typedef struct packed { + logic [CoreIDWidth-1:0] core_id; + logic is_core; + logic is_amo; + reqid_t req_id; + } tcdm_user_t; + + // The metadata type used to restore the information from req to rsp + typedef struct packed { + tcdm_user_t user; + logic write; + } tcdm_meta_t; + + + // Regbus peripherals. + `AXI_TYPEDEF_ALL(axi_mst, addr_t, id_mst_t, data_t, strb_t, user_t) + `AXI_TYPEDEF_ALL(axi_slv, addr_t, id_slv_t, data_t, strb_t, user_t) + `AXI_TYPEDEF_ALL(axi_mst_dma, addr_t, id_dma_mst_t, data_dma_t, strb_dma_t, user_dma_t) + `AXI_TYPEDEF_ALL(axi_slv_dma, addr_t, id_dma_slv_t, data_dma_t, strb_dma_t, user_dma_t) + `AXI_TYPEDEF_ALL(axi_dcache, addr_t, id_dcache_mst_t, data_dma_t, strb_dma_t, user_dma_t) + + `REQRSP_TYPEDEF_ALL(reqrsp, addr_t, data_t, strb_t) + + `MEM_TYPEDEF_ALL(mem, tcdm_mem_addr_t, data_t, strb_t, tcdm_user_t) + `MEM_TYPEDEF_ALL(mem_dma, tcdm_mem_addr_t, data_dma_t, strb_dma_t, logic) + + `TCDM_TYPEDEF_ALL(tcdm, tcdm_addr_t, data_t, strb_t, tcdm_user_t) + `TCDM_TYPEDEF_ALL(tcdm_dma, tcdm_addr_t, data_dma_t, strb_dma_t, logic) + `TCDM_TYPEDEF_ALL(spm, spm_addr_t, data_t, strb_t, tcdm_user_t) + + `REG_BUS_TYPEDEF_ALL(reg, addr_t, data_t, strb_t) + `REG_BUS_TYPEDEF_ALL(reg_dma, addr_t, data_dma_t, strb_dma_t) + + // Event counter increments for the TCDM. + typedef struct packed { + /// Number requests going in + logic [$clog2(NrTCDMPortsCores):0] inc_accessed; + /// Number of requests stalled due to congestion + logic [$clog2(NrTCDMPortsCores):0] inc_congested; + } tcdm_events_t; + + // Event counter increments for DMA. 
+ typedef struct packed { + logic aw_stall, ar_stall, r_stall, w_stall, + buf_w_stall, buf_r_stall; + logic aw_valid, aw_ready, aw_done, aw_bw; + logic ar_valid, ar_ready, ar_done, ar_bw; + logic r_valid, r_ready, r_done, r_bw; + logic w_valid, w_ready, w_done, w_bw; + logic b_valid, b_ready, b_done; + logic dma_busy; + axi_pkg::len_t aw_len, ar_len; + axi_pkg::size_t aw_size, ar_size; + logic [$clog2(AxiDataWidth/8):0] num_bytes_written; + } dma_events_t; + + typedef struct packed { + int unsigned idx; + addr_t start_addr; + addr_t end_addr; + } xbar_rule_t; + + typedef struct packed { + acc_addr_e addr; + logic [5:0] id; + logic [31:0] data_op; + data_t data_arga; + data_t data_argb; + addr_t data_argc; + } acc_issue_req_t; + + typedef struct packed { + logic accept; + logic writeback; + logic loadstore; + logic exception; + logic isfloat; + } acc_issue_rsp_t; + + typedef struct packed { + logic [5:0] id; + logic error; + data_t data; + } acc_rsp_t; + + `SNITCH_VM_TYPEDEF(AxiAddrWidth) + + typedef struct packed { + // Slow domain. + logic flush_i_valid; + addr_t inst_addr; + logic inst_cacheable; + logic inst_valid; + // Fast domain. + acc_issue_req_t acc_req; + logic acc_qvalid; + logic acc_pready; + // Slow domain. + logic [1:0] ptw_valid; + va_t [1:0] ptw_va; + pa_t [1:0] ptw_ppn; + } hive_req_t; + + typedef struct packed { + // Slow domain. + logic flush_i_ready; + logic [31:0] inst_data; + logic inst_ready; + logic inst_error; + // Fast domain. + logic acc_qready; + acc_rsp_t acc_resp; + logic acc_pvalid; + // Slow domain. + logic [1:0] ptw_ready; + l0_pte_t [1:0] ptw_pte; + logic [1:0] ptw_is_4mega; + } hive_rsp_t; + + // ----------- + // Assignments + // ----------- + // Calculate start and end address of TCDM based on the `cluster_base_addr_i`. + addr_t tcdm_start_address, tcdm_end_address; + assign tcdm_start_address = (cluster_base_addr_i & TCDMMask); + assign tcdm_end_address = (tcdm_start_address + TCDMSize) & TCDMMask; + + addr_t cluster_periph_start_address, cluster_periph_end_address; + assign cluster_periph_start_address = tcdm_end_address; + assign cluster_periph_end_address = tcdm_end_address + ClusterPeriphSize * 1024; + + localparam int unsigned ClusterReserve = 4096; // 4 MiB + localparam int unsigned ClusterL2Size = 8192; // 8 MiB + addr_t cluster_l2_start_address, cluster_l2_end_address; + assign cluster_l2_start_address = L2Addr; + assign cluster_l2_end_address = L2Addr + L2Size; + + // ---------------- + // Wire Definitions + // ---------------- + // 1. AXI + axi_slv_req_t [NrNarrowSlaves-1:0] narrow_axi_slv_req; + axi_slv_resp_t [NrNarrowSlaves-1:0] narrow_axi_slv_rsp; + axi_mst_req_t [NrNarrowMasters-1:0] narrow_axi_mst_req; + axi_mst_resp_t [NrNarrowMasters-1:0] narrow_axi_mst_rsp; + + // DMA AXI buses + axi_mst_dma_req_t [NrWideMasters-1:0] wide_axi_mst_req; + axi_mst_dma_resp_t [NrWideMasters-1:0] wide_axi_mst_rsp; + axi_slv_dma_req_t [NrWideSlaves-1 :0] wide_axi_slv_req; + axi_slv_dma_resp_t [NrWideSlaves-1 :0] wide_axi_slv_rsp; + + axi_out_req_t [NumL1CacheCtrl-1:0] axi_cache_req_prescrambled; + + // 2. Memory Subsystem (Banks) + mem_req_t [NrSuperBanks-1:0][BanksPerSuperBank-1:0] ic_req; + mem_rsp_t [NrSuperBanks-1:0][BanksPerSuperBank-1:0] ic_rsp; + + mem_dma_req_t [NrSuperBanks-1:0] sb_dma_req; + mem_dma_rsp_t [NrSuperBanks-1:0] sb_dma_rsp; + + // 3. Memory Subsystem (Interconnect) + tcdm_dma_req_t ext_dma_req; + tcdm_dma_rsp_t ext_dma_rsp; + + // AXI Ports into TCDM (from SoC). 
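+  // Mirroring the cluster above, `axi_soc_req`/`axi_soc_rsp` are driven by an
+  // `axi_to_tcdm` adapter and enter the SPM interconnect as one extra master
+  // next to the core TCDM ports.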
spm_req_t axi_soc_req;
+  spm_rsp_t axi_soc_rsp;
+
+  tcdm_req_t [NrTCDMPortsCores-1:0] tcdm_req;
+  tcdm_rsp_t [NrTCDMPortsCores-1:0] tcdm_rsp;
+
+  core_events_t [NrCores-1:0] core_events;
+  tcdm_events_t               tcdm_events;
+  dma_events_t                dma_events;
+  snitch_icache_pkg::icache_events_t [NrCores-1:0] icache_events;
+
+  // 4. Memory Subsystem (Core side).
+  reqrsp_req_t [NrCores-1:0] core_req, filtered_core_req;
+  reqrsp_rsp_t [NrCores-1:0] core_rsp, filtered_core_rsp;
+
+  // 5. Peripheral Subsystem
+  reg_req_t reg_req;
+  reg_rsp_t reg_rsp;
+
+  // 7. Misc. Wires.
+  logic               icache_prefetch_enable;
+  logic [NrCores-1:0] cl_interrupt;
+
+  // 8. L1 D$
+  spm_req_t  [NrTCDMPortsCores-1:0] spm_req;
+  spm_rsp_t  [NrTCDMPortsCores-1:0] spm_rsp;
+
+  tcdm_req_t [NrTCDMPortsCores-1:0] unmerge_req;
+  tcdm_rsp_t [NrTCDMPortsCores-1:0] unmerge_rsp;
+
+  tcdm_req_t [NrTCDMPortsPerCore-1:0][NumL1CacheCtrl-1:0] cache_req, cache_xbar_req, cache_amo_req;
+  tcdm_rsp_t [NrTCDMPortsPerCore-1:0][NumL1CacheCtrl-1:0] cache_rsp, cache_xbar_rsp, cache_amo_rsp;
+
+  logic       [NumL1CacheCtrl-1:0][NrTCDMPortsPerCore-1:0] cache_req_valid;
+  logic       [NumL1CacheCtrl-1:0][NrTCDMPortsPerCore-1:0] cache_req_ready;
+  tcdm_addr_t [NumL1CacheCtrl-1:0][NrTCDMPortsPerCore-1:0] cache_req_addr;
+  tcdm_user_t [NumL1CacheCtrl-1:0][NrTCDMPortsPerCore-1:0] cache_req_meta;
+  logic       [NumL1CacheCtrl-1:0][NrTCDMPortsPerCore-1:0] cache_req_write;
+  data_t      [NumL1CacheCtrl-1:0][NrTCDMPortsPerCore-1:0] cache_req_data;
+
+  logic       [NumL1CacheCtrl-1:0][NrTCDMPortsPerCore-1:0] cache_rsp_valid;
+  logic       [NumL1CacheCtrl-1:0][NrTCDMPortsPerCore-1:0] cache_rsp_ready;
+  logic       [NumL1CacheCtrl-1:0][NrTCDMPortsPerCore-1:0] cache_rsp_write;
+  data_t      [NumL1CacheCtrl-1:0][NrTCDMPortsPerCore-1:0] cache_rsp_data;
+  tcdm_user_t [NumL1CacheCtrl-1:0][NrTCDMPortsPerCore-1:0] cache_rsp_meta;
+
+  logic            [NumL1CacheCtrl-1:0][NumTagBankPerCtrl-1:0] l1_tag_bank_req;
+  logic            [NumL1CacheCtrl-1:0][NumTagBankPerCtrl-1:0] l1_tag_bank_we;
+  tcdm_bank_addr_t [NumL1CacheCtrl-1:0][NumTagBankPerCtrl-1:0] l1_tag_bank_addr;
+  tag_data_t       [NumL1CacheCtrl-1:0][NumTagBankPerCtrl-1:0] l1_tag_bank_wdata;
+  logic            [NumL1CacheCtrl-1:0][NumTagBankPerCtrl-1:0] l1_tag_bank_be;
+  tag_data_t       [NumL1CacheCtrl-1:0][NumTagBankPerCtrl-1:0] l1_tag_bank_rdata;
+
+  logic            [NumL1CacheCtrl-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_req;
+  logic            [NumL1CacheCtrl-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_we;
+  tcdm_bank_addr_t [NumL1CacheCtrl-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_addr;
+  data_t           [NumL1CacheCtrl-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_wdata;
+  logic            [NumL1CacheCtrl-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_be;
+  data_t           [NumL1CacheCtrl-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_rdata;
+  logic            [NumL1CacheCtrl-1:0][NumDataBankPerCtrl-1:0] l1_data_bank_gnt;
+
+  logic                      l1d_insn_valid;
+  logic [NumL1CacheCtrl-1:0] l1d_insn_ready;
+  logic [1:0]                l1d_insn;
+  tcdm_bank_addr_t           cfg_spm_size;
+  logic                      l1d_busy;
+
+  // High if a port accesses an illegal SPM region (mapped to cache)
+  logic [NrTCDMPortsCores-1:0] spm_error;
+
+
+  // 9.
SRAM Configuration + // impl_in_t [L1NumWrapper-1:0][L1BankPerWP-1:0] impl_l1d_data; + // impl_in_t [L1NumTagBank-1:0] impl_l1d_tag; + // impl_in_t [1:0] impl_l1d_fifo; + + // impl_in_t [ICacheSets-1:0] impl_l1i_data; + // impl_in_t [ICacheSets-1:0] impl_l1i_tag; + + // assign {impl_l1d_data, impl_l1d_tag, impl_l1d_fifo, impl_l1i_data, impl_l1i_tag} = impl_i; + assign error_o = |spm_error; + + + // ------------- + // DMA Subsystem + // ------------- + // Optionally decouple the external wide AXI master port. + + assign axi_wide_req_o[TileMem] = wide_axi_slv_req[SoCDMAOut]; + assign wide_axi_slv_rsp[SoCDMAOut] = axi_wide_rsp_i[TileMem]; + + + axi_cut #( + .Bypass (!RegisterExt ), + .aw_chan_t (axi_mst_aw_chan_t), + .w_chan_t (axi_mst_w_chan_t ), + .b_chan_t (axi_mst_b_chan_t ), + .ar_chan_t (axi_mst_ar_chan_t), + .r_chan_t (axi_mst_r_chan_t ), + .axi_req_t (axi_mst_req_t ), + .axi_resp_t (axi_mst_resp_t ) + ) i_cut_ext_narrow_in ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .slv_req_i (axi_in_req_i ), + .slv_resp_o (axi_in_resp_o ), + .mst_req_o (narrow_axi_mst_req[SoCDMAIn]), + .mst_resp_i (narrow_axi_mst_rsp[SoCDMAIn]) + ); + + logic [DmaXbarCfg.NoSlvPorts-1:0][$clog2(DmaXbarCfg.NoMstPorts)-1:0] dma_xbar_default_port; + xbar_rule_t [DmaXbarCfg.NoAddrRules-1:0] dma_xbar_rule; + + // Diyou: DMA Xbar move to cluster level + + assign dma_xbar_default_port = '{default: SoCDMAOut}; + assign dma_xbar_rule = '{ + '{ + idx : TCDMDMA, + start_addr: tcdm_start_address, + end_addr : tcdm_end_address + }, + '{ + idx : BootROM, + start_addr: BootAddr, + end_addr : BootAddr + 'h1000 + } + }; + + localparam bit [DmaXbarCfg.NoSlvPorts-1:0] DMAEnableDefaultMstPort = '1; + axi_xbar #( + .Cfg (DmaXbarCfg ), + .ATOPs (0 ), + .slv_aw_chan_t (axi_mst_dma_aw_chan_t), + .mst_aw_chan_t (axi_slv_dma_aw_chan_t), + .w_chan_t (axi_mst_dma_w_chan_t ), + .slv_b_chan_t (axi_mst_dma_b_chan_t ), + .mst_b_chan_t (axi_slv_dma_b_chan_t ), + .slv_ar_chan_t (axi_mst_dma_ar_chan_t), + .mst_ar_chan_t (axi_slv_dma_ar_chan_t), + .slv_r_chan_t (axi_mst_dma_r_chan_t ), + .mst_r_chan_t (axi_slv_dma_r_chan_t ), + .slv_req_t (axi_mst_dma_req_t ), + .slv_resp_t (axi_mst_dma_resp_t ), + .mst_req_t (axi_slv_dma_req_t ), + .mst_resp_t (axi_slv_dma_resp_t ), + .rule_t (xbar_rule_t ) + ) i_axi_dma_xbar ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .test_i (1'b0 ), + .slv_ports_req_i (wide_axi_mst_req ), + .slv_ports_resp_o (wide_axi_mst_rsp ), + .mst_ports_req_o (wide_axi_slv_req ), + .mst_ports_resp_i (wide_axi_slv_rsp ), + .addr_map_i (dma_xbar_rule ), + .en_default_mst_port_i (DMAEnableDefaultMstPort), + .default_mst_port_i (dma_xbar_default_port ) + ); + + addr_t ext_dma_req_q_addr_nontrunc; + + axi_to_mem_interleaved #( + .axi_req_t (axi_slv_dma_req_t ), + .axi_resp_t (axi_slv_dma_resp_t ), + .AddrWidth (AxiAddrWidth ), + .DataWidth (AxiDataWidth ), + .IdWidth (WideIdWidthOut ), + .NumBanks (1 ), + .BufDepth (MemoryMacroLatency + 1) + ) i_axi_to_mem_dma ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .busy_o (/* Unused */ ), + .test_i (1'b0 ), + .axi_req_i (wide_axi_slv_req[TCDMDMA] ), + .axi_resp_o (wide_axi_slv_rsp[TCDMDMA] ), + .mem_req_o (ext_dma_req.q_valid ), + .mem_gnt_i (ext_dma_rsp.q_ready ), + .mem_addr_o (ext_dma_req_q_addr_nontrunc ), + .mem_wdata_o (ext_dma_req.q.data ), + .mem_strb_o (ext_dma_req.q.strb ), + .mem_atop_o (/* The DMA does not support atomics */), + .mem_we_o (ext_dma_req.q.write ), + .mem_rvalid_i (ext_dma_rsp.p_valid ), + .mem_rdata_i (ext_dma_rsp.p.data ) + ); + + assign ext_dma_req.q.addr = 
tcdm_addr_t'(ext_dma_req_q_addr_nontrunc); + assign ext_dma_req.q.amo = reqrsp_pkg::AMONone; + assign ext_dma_req.q.user = '0; + + spatz_tcdm_interconnect #( + .NumInp (1 ), + .NumOut (NrSuperBanks ), + .tcdm_req_t (tcdm_dma_req_t ), + .tcdm_rsp_t (tcdm_dma_rsp_t ), + .mem_req_t (mem_dma_req_t ), + .mem_rsp_t (mem_dma_rsp_t ), + .user_t (logic ), + .MemAddrWidth (TCDMMemAddrWidth ), + .DataWidth (AxiDataWidth ), + .MemoryResponseLatency (MemoryMacroLatency) + ) i_dma_interconnect ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .req_i (ext_dma_req), + .rsp_o (ext_dma_rsp), + .mem_req_o (sb_dma_req ), + .mem_rsp_i (sb_dma_rsp ) + ); + + // ---------------- + // Memory Subsystem + // ---------------- + for (genvar i = 0; i < NrSuperBanks; i++) begin : gen_tcdm_super_bank + + mem_req_t [BanksPerSuperBank-1:0] amo_req; + mem_rsp_t [BanksPerSuperBank-1:0] amo_rsp; + + logic [BanksPerSuperBank-1:0] mem_cs, mem_wen; + tcdm_mem_addr_t [BanksPerSuperBank-1:0] mem_add; + tcdm_mem_addr_t [BanksPerSuperBank-1:0] mem_add_max; + strb_t [BanksPerSuperBank-1:0] mem_be; + data_t [BanksPerSuperBank-1:0] mem_rdata, mem_wdata; + tcdm_meta_t [BanksPerSuperBank-1:0] bank_req_meta, mem_req_meta, bank_rsp_meta; + + mem_wide_narrow_mux #( + .NarrowDataWidth (NarrowDataWidth), + .WideDataWidth (AxiDataWidth ), + .mem_narrow_req_t (mem_req_t ), + .mem_narrow_rsp_t (mem_rsp_t ), + .mem_wide_req_t (mem_dma_req_t ), + .mem_wide_rsp_t (mem_dma_rsp_t ) + ) i_tcdm_mux ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .in_narrow_req_i (ic_req [i] ), + .in_narrow_rsp_o (ic_rsp [i] ), + .in_wide_req_i (sb_dma_req [i] ), + .in_wide_rsp_o (sb_dma_rsp [i] ), + .out_req_o (amo_req ), + .out_rsp_i (amo_rsp ), + .sel_wide_i (sb_dma_req[i].q_valid) + ); + + // generate banks of the superbank + for (genvar j = 0; j < BanksPerSuperBank; j++) begin : gen_tcdm_bank + tc_sram_impl #( + .NumWords (TCDMDepth), + .DataWidth (DataWidth), + .ByteWidth (8 ), + .NumPorts (1 ), + .Latency (1 ), + .SimInit ("zeros" ) + ) i_spm_mem ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .impl_i ('0 ), + .impl_o (/* Unused */), + .req_i (mem_cs[j] ), + .we_i (mem_wen[j] ), + .addr_i (mem_add[j] ), + .wdata_i (mem_wdata[j] ), + .be_i (mem_be[j] ), + .rdata_o (mem_rdata[j] ) + ); + + data_t amo_rdata_local; + + // TODO(zarubaf): Share atomic units between multiple cuts + snitch_amo_shim #( + .AddrMemWidth ( TCDMMemAddrWidth ), + .DataWidth ( DataWidth ), + .CoreIDWidth ( CoreIDWidth ) + ) i_amo_shim ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .valid_i (amo_req[j].q_valid ), + .ready_o (amo_rsp[j].q_ready ), + .addr_i (amo_req[j].q.addr ), + .write_i (amo_req[j].q.write ), + .wdata_i (amo_req[j].q.data ), + .wstrb_i (amo_req[j].q.strb ), + .core_id_i (amo_req[j].q.user.core_id ), + .is_core_i (amo_req[j].q.user.is_core ), + .rdata_o (amo_rdata_local ), + .amo_i (amo_req[j].q.amo ), + .mem_req_o (mem_cs[j] ), + .mem_add_o (mem_add[j] ), + .mem_wen_o (mem_wen[j] ), + .mem_wdata_o (mem_wdata[j] ), + .mem_be_o (mem_be[j] ), + .mem_rdata_i (mem_rdata[j] ), + .dma_access_i (sb_dma_req[i].q_valid ), + // TODO(zarubaf): Signal AMO conflict somewhere. Socregs? + .amo_conflict_o (/* Unused */ ) + ); + + // Insert a pipeline register at the output of each SRAM. 
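+ // The cut depth equals RegisterTCDMCuts; a depth of zero degenerates into a + // feed-through wire. Any read latency added here must stay consistent with + // the interconnect, which is why i_tcdm_interconnect below is parameterized + // with MemoryResponseLatency = 1 + RegisterTCDMCuts.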
+ shift_reg #( + .dtype(data_t ), + .Depth(int'(RegisterTCDMCuts)) + ) i_sram_pipe ( + .clk_i (clk_i ), + .rst_ni(rst_ni ), + .d_i (amo_rdata_local ), + .d_o (amo_rsp[j].p.data) + ); + + // the meta data information + assign bank_req_meta[j] = '{ + user: amo_req[j].q.user, + write: amo_req[j].q.write, + default: '0 + }; + assign amo_rsp[j].p.user = bank_rsp_meta[j].user; + assign amo_rsp[j].p.write = bank_rsp_meta[j].write; + + shift_reg #( + .dtype(tcdm_meta_t ), + .Depth(int'(RegisterTCDMCuts)) + ) i_req_meta_pipe ( + .clk_i (clk_i ), + .rst_ni(rst_ni ), + .d_i (bank_req_meta[j] ), + .d_o (mem_req_meta[j] ) + ); + shift_reg #( + .dtype(tcdm_meta_t ), + .Depth(int'(RegisterTCDMCuts)) + ) i_rsp_meta_pipe ( + .clk_i (clk_i ), + .rst_ni(rst_ni ), + .d_i (mem_req_meta[j] ), + .d_o (bank_rsp_meta[j] ) + ); + end + end + + logic [NrTCDMPortsCores-1:0] unmerge_pready; + logic [NrTCDMPortsPerCore-1:0][NumL1CacheCtrl-1:0] cache_pready, cache_xbar_pready, cache_amo_pready; + + // split the requests for spm or cache from core side + spatz_addr_mapper #( + .NumIO (NrTCDMPortsCores ), + .AddrWidth (L1AddrWidth ), + .SPMAddrWidth (SPMAddrWidth ), + .DataWidth (DataWidth ), + .mem_req_t (tcdm_req_t ), + .mem_rsp_t (tcdm_rsp_t ), + .mem_rsp_chan_t (tcdm_rsp_chan_t ), + .spm_req_t (spm_req_t ), + .spm_rsp_t (spm_rsp_t ) + ) i_tcdm_mapper ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + // Input + .mem_req_i (tcdm_req ), + .mem_rsp_o (tcdm_rsp ), + .error_o (spm_error ), + // Address + .tcdm_start_address_i (tcdm_start_address[L1AddrWidth-1:0] ), + .tcdm_end_address_i (tcdm_end_address[L1AddrWidth-1:0] ), + .spm_size_i (tcdm_end_address[L1AddrWidth-1:0] - tcdm_start_address[L1AddrWidth-1:0]), + .flush_i (l1d_busy ), + // Output + .spm_req_o (spm_req ), + .spm_rsp_i (spm_rsp ), + .cache_req_o (unmerge_req ), + .cache_pready_o (unmerge_pready ), + .cache_rsp_i (unmerge_rsp ) + ); + + for (genvar j = 0; j < NrTCDMPortsPerCore; j++) begin + for (genvar cb = 0; cb < NumL1CacheCtrl; cb++) begin + assign cache_req [j][cb] = unmerge_req [cb*NrTCDMPortsPerCore+j]; + assign cache_pready[j][cb] = unmerge_pready[cb*NrTCDMPortsPerCore+j]; + assign unmerge_rsp [cb*NrTCDMPortsPerCore+j] = cache_rsp [j][cb]; + end + end + + // Used to determine the mapping policy between different cache banks. 
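+ // Example (using the configuration sketched in tcdm_cache_interco): with + // NumL1CacheCtrl = 4 and a 512-bit cache line (6 offset bits), a + // dynamic_offset of 6 picks the controller from addr[7:6], so consecutive + // cache lines interleave across the four cache controllers.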
+ // Set through CSR + logic [$clog2(SpatzAxiAddrWidth)-1:0] dynamic_offset; + + /// Wire requests after strb handling to the cache controller + for (genvar j = 0; j < NrTCDMPortsPerCore; j++) begin : gen_cache_xbar + tcdm_cache_interco #( + .NumCore (NrCores ), + .NumCache (NumL1CacheCtrl ), + .AddrWidth (SpatzAxiAddrWidth ), + .tcdm_req_t (tcdm_req_t ), + .tcdm_rsp_t (tcdm_rsp_t ), + .tcdm_req_chan_t (tcdm_req_chan_t ), + .tcdm_rsp_chan_t (tcdm_rsp_chan_t ) + ) i_cache_xbar ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .dynamic_offset_i (dynamic_offset ), + .core_req_i (cache_req [j] ), + .core_rsp_ready_i (cache_pready [j] ), + .core_rsp_o (cache_rsp [j] ), + .mem_req_o (cache_xbar_req [j] ), + .mem_rsp_ready_o (cache_xbar_pready[j] ), + .mem_rsp_i (cache_xbar_rsp [j] ) + ); + end + + for (genvar cb = 0; cb < NumL1CacheCtrl; cb++) begin : gen_cache_connect + for (genvar j = 0; j < NrTCDMPortsPerCore; j++) begin : gen_cache_amo + spatz_cache_amo #( + .DataWidth ( DataWidth ), + .CoreIDWidth ( CoreIDWidth ), + .tcdm_req_t ( tcdm_req_t ), + .tcdm_rsp_t ( tcdm_rsp_t ), + .tcdm_req_chan_t ( tcdm_req_chan_t ), + .tcdm_rsp_chan_t ( tcdm_rsp_chan_t ), + .tcdm_user_t ( tcdm_user_t ) + ) i_cache_amo ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .core_req_i (cache_xbar_req [j][cb] ), + .core_rsp_ready_i (cache_xbar_pready[j][cb] ), + .core_rsp_o (cache_xbar_rsp [j][cb] ), + .mem_req_o (cache_amo_req [j][cb] ), + .mem_rsp_ready_o (cache_amo_pready [j][cb] ), + .mem_rsp_i (cache_amo_rsp [j][cb] ) + ); + assign cache_req_valid[cb][j] = cache_amo_req[j][cb].q_valid; + assign cache_req_addr [cb][j] = cache_amo_req[j][cb].q.addr; + assign cache_req_meta [cb][j] = cache_amo_req[j][cb].q.user; + assign cache_req_write[cb][j] = cache_amo_req[j][cb].q.write; + assign cache_req_data [cb][j] = cache_amo_req[j][cb].q.data; + + // assign cache_rsp_ready[cb][j] = 1'b1; + assign cache_rsp_ready[cb][j] = cache_amo_pready[j][cb]; + + assign cache_amo_rsp[j][cb].p_valid = cache_rsp_valid[cb][j]; + assign cache_amo_rsp[j][cb].q_ready = cache_req_ready[cb][j]; + assign cache_amo_rsp[j][cb].p.data = cache_rsp_data [cb][j]; + assign cache_amo_rsp[j][cb].p.user = cache_rsp_meta [cb][j]; + + assign cache_amo_rsp[j][cb].p.write = cache_rsp_write[cb][j]; + end + end + + // TODO: remove + tcdm_bank_addr_t num_spm_lines; + assign num_spm_lines = cfg_spm_size * (DepthPerBank / L1Size); + + // For address scrambling + localparam NumSelBits = $clog2(NumL1CacheCtrl); + logic [SpatzAxiAddrWidth-1:0] bitmask_up, bitmask_lo; + assign bitmask_lo = (1 << dynamic_offset) - 1; + // We will keep AddrWidth - Offset - log2(CacheBanks) bits in the upper half, and add back the NumSelBits bits + assign bitmask_up = ((1 << (SpatzAxiAddrWidth - dynamic_offset - NumSelBits)) - 1) << (dynamic_offset); + + + for (genvar cb = 0; cb < NumL1CacheCtrl; cb++) begin: gen_l1_cache_ctrl + flamingo_spatz_cache_ctrl #( + // Core + .NumPorts (NrTCDMPortsPerCore ), + .CoalExtFactor (L1CoalFactor ), + .AddrWidth (L1AddrWidth ), + .WordWidth (DataWidth ), + // Cache + .NumCacheEntry (L1NumEntryPerCtrl ), + .CacheLineWidth (L1LineWidth ), + .SetAssociativity (L1AssoPerCtrl ), + .BankFactor (L1BankFactor ), + // Type + .core_meta_t (tcdm_user_t ), + .impl_in_t (impl_in_t ), + .axi_req_t (axi_out_req_t ), + .axi_resp_t (axi_out_resp_t ) + ) i_l1_controller ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .impl_i ('0 ), + // Sync Control + .cache_sync_valid_i (l1d_insn_valid ), + .cache_sync_ready_o (l1d_insn_ready[cb] ), + .cache_sync_insn_i (l1d_insn ), + // SPM Size + // 
The SPM region inside the cache is computed differently + // than in the other modules (it needs to be multiplied by 2) + // Currently assume full cache + .bank_depth_for_SPM_i ('0 ), + // Request + .core_req_valid_i (cache_req_valid[cb] ), + .core_req_ready_o (cache_req_ready[cb] ), + .core_req_addr_i (cache_req_addr[cb] ), + .core_req_meta_i (cache_req_meta[cb] ), + .core_req_write_i (cache_req_write[cb] ), + .core_req_wdata_i (cache_req_data[cb] ), + // Response + .core_resp_valid_o (cache_rsp_valid[cb] ), + .core_resp_ready_i (cache_rsp_ready[cb] ), + .core_resp_write_o (cache_rsp_write[cb] ), + .core_resp_data_o (cache_rsp_data[cb] ), + .core_resp_meta_o (cache_rsp_meta[cb] ), + // AXI refill + .axi_req_o (axi_cache_req_prescrambled[cb] ), + .axi_resp_i (axi_cache_rsp_i[cb] ), + // Tag Banks + .tcdm_tag_bank_req_o (l1_tag_bank_req[cb] ), + .tcdm_tag_bank_we_o (l1_tag_bank_we[cb] ), + .tcdm_tag_bank_addr_o (l1_tag_bank_addr[cb] ), + .tcdm_tag_bank_wdata_o (l1_tag_bank_wdata[cb] ), + .tcdm_tag_bank_be_o (l1_tag_bank_be[cb] ), + .tcdm_tag_bank_rdata_i (l1_tag_bank_rdata[cb] ), + // Data Banks + .tcdm_data_bank_req_o (l1_data_bank_req[cb] ), + .tcdm_data_bank_we_o (l1_data_bank_we[cb] ), + .tcdm_data_bank_addr_o (l1_data_bank_addr[cb] ), + .tcdm_data_bank_wdata_o(l1_data_bank_wdata[cb] ), + .tcdm_data_bank_be_o (l1_data_bank_be[cb] ), + .tcdm_data_bank_rdata_i(l1_data_bank_rdata[cb] ), + .tcdm_data_bank_gnt_i (l1_data_bank_gnt[cb] ) + ); + + always_comb begin : bank_addr_scramble + axi_cache_req_o[cb] = axi_cache_req_prescrambled[cb]; + // Pass the lower bits first + axi_cache_req_o[cb].ar.addr = axi_cache_req_prescrambled[cb].ar.addr & bitmask_lo; + // Shift the upper part to its location + axi_cache_req_o[cb].ar.addr |= ((axi_cache_req_prescrambled[cb].ar.addr & bitmask_up) << NumSelBits); + // Add back the removed cache bank ID + axi_cache_req_o[cb].ar.addr |= (cb << dynamic_offset); + end + + for (genvar j = 0; j < NumTagBankPerCtrl; j++) begin + tc_sram_impl #( + .NumWords (L1CacheWayEntry/L1BankFactor), + .DataWidth ($bits(tag_data_t) ), + .ByteWidth ($bits(tag_data_t) ), + .NumPorts (1 ), + .Latency (1 ), + .SimInit ("zeros" ), + .impl_in_t (impl_in_t ) + ) i_meta_bank ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .impl_i ('0 ), + .impl_o (/* unused */ ), + .req_i (l1_tag_bank_req [cb][j]), + .we_i (l1_tag_bank_we [cb][j]), + .addr_i (l1_tag_bank_addr [cb][j]), + .wdata_i(l1_tag_bank_wdata[cb][j]), + .be_i (l1_tag_bank_be [cb][j]), + .rdata_o(l1_tag_bank_rdata[cb][j]) + ); + end + + for (genvar j = 0; j < NumDataBankPerCtrl; j = j+4) begin : gen_l1_data_banks + tc_sram_impl #( + .NumWords (L1CacheWayEntry/L1BankFactor), + .DataWidth (DataWidth*4), + .ByteWidth (DataWidth*4), + .NumPorts (1), + .Latency (1), + .SimInit ("zeros") + ) i_data_bank ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .impl_i ('0 ), + .impl_o (/* unused */ ), + .req_i ( l1_data_bank_req [cb][j]), + .we_i ( l1_data_bank_we [cb][j]), + .addr_i ( l1_data_bank_addr [cb][j]), + .wdata_i({l1_data_bank_wdata[cb][j+3], l1_data_bank_wdata[cb][j+2], l1_data_bank_wdata[cb][j+1], l1_data_bank_wdata[cb][j]}), + .be_i ( l1_data_bank_be [cb][j] ), + .rdata_o({l1_data_bank_rdata[cb][j+3], l1_data_bank_rdata[cb][j+2], l1_data_bank_rdata[cb][j+1], l1_data_bank_rdata[cb][j]}) + ); + + assign l1_data_bank_gnt[cb][j] = 1'b1; + assign l1_data_bank_gnt[cb][j+1] = 1'b1; + assign l1_data_bank_gnt[cb][j+2] = 1'b1; + assign l1_data_bank_gnt[cb][j+3] = 1'b1; + end + + // for (genvar j = 0; j < NumDataBankPerCtrl; j++) begin : gen_l1_data_banks + // tc_sram_impl #( + // 
.NumWords (L1CacheWayEntry/L1BankFactor), + // .DataWidth (DataWidth), + // .ByteWidth (DataWidth), + // .NumPorts (1), + // .Latency (1), + // .SimInit ("zeros") + // ) i_data_bank ( + // .clk_i (clk_i ), + // .rst_ni (rst_ni ), + // .impl_i ('0 ), + // .impl_o (/* unsed */ ), + // .req_i (l1_data_bank_req [cb][j]), + // .we_i (l1_data_bank_we [cb][j]), + // .addr_i (l1_data_bank_addr [cb][j]), + // .wdata_i(l1_data_bank_wdata[cb][j]), + // .be_i (l1_data_bank_be [cb][j]), + // .rdata_o(l1_data_bank_rdata[cb][j]) + // ); + + // assign l1_data_bank_gnt[cb][j] = 1'b1; + // end + end + + spatz_tcdm_interconnect #( + .NumInp (NumTCDMIn ), + .NumOut (L1NumWrapper ), + .tcdm_req_t (spm_req_t ), + .tcdm_rsp_t (spm_rsp_t ), + .mem_req_t (mem_req_t ), + .mem_rsp_t (mem_rsp_t ), + .MemAddrWidth (TCDMMemAddrWidth ), + .DataWidth (DataWidth ), + .user_t (tcdm_user_t ), + .MemoryResponseLatency (1 + RegisterTCDMCuts) + ) i_tcdm_interconnect ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .req_i ({axi_soc_req, spm_req} ), + .rsp_o ({axi_soc_rsp, spm_rsp} ), + .mem_req_o (ic_req ), + .mem_rsp_i (ic_rsp ) + ); + + hive_req_t [NrCores-1:0] hive_req; + hive_rsp_t [NrCores-1:0] hive_rsp; + + for (genvar i = 0; i < NrCores; i++) begin : gen_core + localparam int unsigned TcdmPorts = get_tcdm_ports(i); + localparam int unsigned TcdmPortsOffs = get_tcdm_port_offs(i); + + axi_mst_dma_req_t axi_dma_req; + axi_mst_dma_resp_t axi_dma_res; + interrupts_t irq; + dma_events_t dma_core_events; + + sync #(.STAGES (2)) + i_sync_debug (.clk_i, .rst_ni, .serial_i (debug_req_i[i]), .serial_o (irq.debug)); + sync #(.STAGES (2)) + i_sync_meip (.clk_i, .rst_ni, .serial_i (meip_i[i]), .serial_o (irq.meip)); + sync #(.STAGES (2)) + i_sync_mtip (.clk_i, .rst_ni, .serial_i (mtip_i[i]), .serial_o (irq.mtip)); + sync #(.STAGES (2)) + i_sync_msip (.clk_i, .rst_ni, .serial_i (msip_i[i]), .serial_o (irq.msip)); + assign irq.mcip = cl_interrupt[i]; + + tcdm_req_t [TcdmPorts-1:0] tcdm_req_wo_user; + + logic [31:0] hart_id; + assign hart_id = hart_base_id_i + i; + + spatz_cc #( + .BootAddr (BootAddr ), + .L2Addr (L2Addr ), + .L2Size (L2Size ), + .RVE (1'b0 ), + .RVF (RVF ), + .RVD (RVD ), + .RVV (RVV ), + .Xdma (Xdma[i] ), + .AddrWidth (AxiAddrWidth ), + .DataWidth (NarrowDataWidth ), + .UserWidth (AxiUserWidth ), + .DMADataWidth (AxiDataWidth ), + .DMAIdWidth (AxiIdWidthIn ), + .SnitchPMACfg (SnitchPMACfg ), + .DMAAxiReqFifoDepth (DMAAxiReqFifoDepth ), + .DMAReqFifoDepth (DMAReqFifoDepth ), + .dreq_t (reqrsp_req_t ), + .drsp_t (reqrsp_rsp_t ), + .tcdm_req_t (tcdm_req_t ), + .tcdm_req_chan_t (tcdm_req_chan_t ), + .tcdm_rsp_t (tcdm_rsp_t ), + .tcdm_rsp_chan_t (tcdm_rsp_chan_t ), + .axi_req_t (axi_mst_dma_req_t ), + .axi_ar_chan_t (axi_mst_dma_ar_chan_t ), + .axi_aw_chan_t (axi_mst_dma_aw_chan_t ), + .axi_rsp_t (axi_mst_dma_resp_t ), + .hive_req_t (hive_req_t ), + .hive_rsp_t (hive_rsp_t ), + .acc_issue_req_t (acc_issue_req_t ), + .acc_issue_rsp_t (acc_issue_rsp_t ), + .acc_rsp_t (acc_rsp_t ), + .dma_events_t (dma_events_t ), + .dma_perf_t (axi_dma_pkg::dma_perf_t ), + .XDivSqrt (1'b0 ), + .XF16 (1'b1 ), + .XF16ALT (1'b1 ), + .XF8 (1'b1 ), + .XF8ALT (1'b1 ), + .IsoCrossing (1'b0 ), + .NumIntOutstandingLoads (NumIntOutstandingLoads [i]), + .NumIntOutstandingMem (NumIntOutstandingMem [i]), + .NumSpatzOutstandingLoads(NumSpatzOutstandingLoads[i]), + .FPUImplementation (FPUImplementation [i]), + .RegisterOffloadRsp (RegisterOffloadRsp ), + .RegisterCoreReq (RegisterCoreReq ), + .RegisterCoreRsp (RegisterCoreRsp ), + .NumSpatzFPUs (NumSpatzFPUs ), 
+ .NumSpatzIPUs (NumSpatzIPUs ), + .TCDMAddrWidth (SPMAddrWidth ) + ) i_spatz_cc ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .testmode_i (1'b0 ), + .hart_id_i (hart_id ), + .hive_req_o (hive_req[i] ), + .hive_rsp_i (hive_rsp[i] ), + .irq_i (irq ), + .data_req_o (core_req[i] ), + .data_rsp_i (core_rsp[i] ), + .tcdm_req_o (tcdm_req_wo_user ), + .tcdm_rsp_i (tcdm_rsp[TcdmPortsOffs +: TcdmPorts]), + .axi_dma_req_o (axi_dma_req ), + .axi_dma_res_i (axi_dma_res ), + .axi_dma_busy_o (/* Unused */ ), + .axi_dma_perf_o (/* Unused */ ), + .axi_dma_events_o (dma_core_events ), + .core_events_o (core_events[i] ), + .tcdm_addr_base_i (tcdm_start_address ) + ); + for (genvar j = 0; j < TcdmPorts; j++) begin : gen_tcdm_user + always_comb begin + tcdm_req[TcdmPortsOffs+j].q = tcdm_req_wo_user[j].q; + tcdm_req[TcdmPortsOffs+j].q.user.core_id = i[CoreIDWidth-1:0]; + tcdm_req[TcdmPortsOffs+j].q.user.is_core = 1; + tcdm_req[TcdmPortsOffs+j].q_valid = tcdm_req_wo_user[j].q_valid; + end + end + if (Xdma[i]) begin : gen_dma_connection + assign wide_axi_mst_req[SDMAMst] = axi_dma_req; + assign axi_dma_res = wide_axi_mst_rsp[SDMAMst]; + assign dma_events = dma_core_events; + end else begin + assign axi_dma_res = '0; + end + end + + // ---------------- + // Instruction Cache + // ---------------- + + addr_t [NrCores-1:0] inst_addr; + logic [NrCores-1:0] inst_cacheable; + logic [NrCores-1:0][31:0] inst_data; + logic [NrCores-1:0] inst_valid; + logic [NrCores-1:0] inst_ready; + logic [NrCores-1:0] inst_error; + logic [NrCores-1:0] flush_valid; + logic [NrCores-1:0] flush_ready; + + for (genvar i = 0; i < NrCores; i++) begin : gen_unpack_icache + assign inst_addr[i] = hive_req[i].inst_addr; + assign inst_cacheable[i] = hive_req[i].inst_cacheable; + assign inst_valid[i] = hive_req[i].inst_valid; + assign flush_valid[i] = hive_req[i].flush_i_valid; + assign hive_rsp[i] = '{ + inst_data : inst_data[i], + inst_ready : inst_ready[i], + inst_error : inst_error[i], + flush_i_ready: flush_ready[i], + default : '0 + }; + end + + snitch_icache #( + .NR_FETCH_PORTS ( NrCores ), + .L0_LINE_COUNT ( 8 ), + .LINE_WIDTH ( ICacheLineWidth ), + .LINE_COUNT ( ICacheLineCount ), + .SET_COUNT ( ICacheSets ), + .FETCH_AW ( AxiAddrWidth ), + .FETCH_DW ( 32 ), + .FILL_AW ( AxiAddrWidth ), + .FILL_DW ( AxiDataWidth ), + .EARLY_LATCH ( 0 ), + .L0_EARLY_TAG_WIDTH ( snitch_pkg::PAGE_SHIFT - $clog2(ICacheLineWidth/8) ), + .ISO_CROSSING ( 1'b0 ), + .axi_req_t ( axi_mst_dma_req_t ), + .axi_rsp_t ( axi_mst_dma_resp_t ), + .sram_cfg_data_t ( impl_in_t ), + .sram_cfg_tag_t ( impl_in_t ) + ) i_snitch_icache ( + .clk_i ( clk_i ), + .clk_d2_i ( clk_i ), + .rst_ni ( rst_ni ), + .enable_prefetching_i ( icache_prefetch_enable ), + .icache_events_o ( icache_events ), + .flush_valid_i ( flush_valid ), + .flush_ready_o ( flush_ready ), + .inst_addr_i ( inst_addr ), + .inst_cacheable_i ( inst_cacheable ), + .inst_data_o ( inst_data ), + .inst_valid_i ( inst_valid ), + .inst_ready_o ( inst_ready ), + .inst_error_o ( inst_error ), + .sram_cfg_tag_i ( '0 ), + .sram_cfg_data_i ( '0 ), + .axi_req_o ( wide_axi_mst_req[ICache] ), + .axi_rsp_i ( wide_axi_mst_rsp[ICache] ) + ); + + // -------- + // Cores SoC + // -------- + spatz_barrier #( + .AddrWidth (AxiAddrWidth ), + .NrPorts (NrCores ), + .dreq_t (reqrsp_req_t ), + .drsp_t (reqrsp_rsp_t ) + ) i_snitch_barrier ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .in_req_i (core_req ), + .in_rsp_o (core_rsp ), + .out_req_o (filtered_core_req ), + .out_rsp_i (filtered_core_rsp ), + .cluster_periph_start_address_i 
(cluster_periph_start_address) + ); + + reqrsp_req_t core_to_axi_req; + reqrsp_rsp_t core_to_axi_rsp; + user_t cluster_user; + // Atomic ID, needs to be unique ID of cluster + // cluster_id + HartIdOffset + 1 (because 0 is for non-atomic masters) + assign cluster_user = (hart_base_id_i / NrCores) + (hart_base_id_i % NrCores) + 1'b1; + + reqrsp_mux #( + .NrPorts (NrCores ), + .AddrWidth (AxiAddrWidth ), + .DataWidth (NarrowDataWidth ), + .req_t (reqrsp_req_t ), + .rsp_t (reqrsp_rsp_t ), + .RespDepth (2 ) + ) i_reqrsp_mux_core ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .slv_req_i (filtered_core_req), + .slv_rsp_o (filtered_core_rsp), + .mst_req_o (core_to_axi_req ), + .mst_rsp_i (core_to_axi_rsp ), + .idx_o (/*unused*/ ) + ); + + reqrsp_to_axi #( + .DataWidth (NarrowDataWidth), + .UserWidth (NarrowUserWidth), + .reqrsp_req_t (reqrsp_req_t ), + .reqrsp_rsp_t (reqrsp_rsp_t ), + .axi_req_t (axi_mst_req_t ), + .axi_rsp_t (axi_mst_resp_t ) + ) i_reqrsp_to_axi_core ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .user_i (cluster_user ), + .reqrsp_req_i (core_to_axi_req ), + .reqrsp_rsp_o (core_to_axi_rsp ), + .axi_req_o (narrow_axi_mst_req[CoreReq]), + .axi_rsp_i (narrow_axi_mst_rsp[CoreReq]) + ); + + xbar_rule_t [NrNarrowRules-1:0] cluster_xbar_rules; + + assign cluster_xbar_rules = '{ + '{ + idx : TCDM, + start_addr: tcdm_start_address, + end_addr : tcdm_end_address + }, + '{ + idx : ClusterPeripherals, + start_addr: cluster_periph_start_address, + end_addr : cluster_periph_end_address + } + }; + + localparam bit [ClusterXbarCfg.NoSlvPorts-1:0] ClusterEnableDefaultMstPort = '1; + localparam logic [ClusterXbarCfg.NoSlvPorts-1:0][cf_math_pkg::idx_width(ClusterXbarCfg.NoMstPorts)-1:0] ClusterXbarDefaultPort = '{default: SoC}; + + axi_xbar #( + .Cfg (ClusterXbarCfg ), + .slv_aw_chan_t (axi_mst_aw_chan_t), + .mst_aw_chan_t (axi_slv_aw_chan_t), + .w_chan_t (axi_mst_w_chan_t ), + .slv_b_chan_t (axi_mst_b_chan_t ), + .mst_b_chan_t (axi_slv_b_chan_t ), + .slv_ar_chan_t (axi_mst_ar_chan_t), + .mst_ar_chan_t (axi_slv_ar_chan_t), + .slv_r_chan_t (axi_mst_r_chan_t ), + .mst_r_chan_t (axi_slv_r_chan_t ), + .slv_req_t (axi_mst_req_t ), + .slv_resp_t (axi_mst_resp_t ), + .mst_req_t (axi_slv_req_t ), + .mst_resp_t (axi_slv_resp_t ), + .rule_t (xbar_rule_t ) + ) i_cluster_xbar ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .test_i (1'b0 ), + .slv_ports_req_i (narrow_axi_mst_req ), + .slv_ports_resp_o (narrow_axi_mst_rsp ), + .mst_ports_req_o (narrow_axi_slv_req ), + .mst_ports_resp_i (narrow_axi_slv_rsp ), + .addr_map_i (cluster_xbar_rules ), + .en_default_mst_port_i (ClusterEnableDefaultMstPort), + .default_mst_port_i (ClusterXbarDefaultPort ) + ); + + // --------- + // Slaves + // --------- + // 1. TCDM + // Add an adapter that allows access from AXI to the TCDM. + axi_to_tcdm #( + .axi_req_t (axi_slv_req_t ), + .axi_rsp_t (axi_slv_resp_t ), + .tcdm_req_t (spm_req_t ), + .tcdm_rsp_t (spm_rsp_t ), + .AddrWidth (AxiAddrWidth ), + .DataWidth (NarrowDataWidth ), + .IdWidth (NarrowIdWidthOut ), + .BufDepth (MemoryMacroLatency + 1) + ) i_axi_to_tcdm ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .axi_req_i (narrow_axi_slv_req[TCDM]), + .axi_rsp_o (narrow_axi_slv_rsp[TCDM]), + .tcdm_req_o (axi_soc_req ), + .tcdm_rsp_i (axi_soc_rsp ) + ); + + // // 2. Peripherals + // // Peripherals at cluster level. + // assign axi_narrow_req_o[TilePeriph] = narrow_axi_slv_req[ClusterPeripherals]; + // assign narrow_axi_slv_rsp[ClusterPeripherals] = axi_narrow_rsp_i[TilePeriph]; + + // 2. 
Peripherals + // Diyou: should we move it to cluster level? + axi_to_reg #( + .ADDR_WIDTH (AxiAddrWidth ), + .DATA_WIDTH (NarrowDataWidth ), + .AXI_MAX_WRITE_TXNS (1 ), + .AXI_MAX_READ_TXNS (1 ), + .DECOUPLE_W (0 ), + .ID_WIDTH (NarrowIdWidthOut ), + .USER_WIDTH (NarrowUserWidth ), + .axi_req_t (axi_slv_req_t ), + .axi_rsp_t (axi_slv_resp_t ), + .reg_req_t (reg_req_t ), + .reg_rsp_t (reg_rsp_t ) + ) i_axi_to_reg ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .testmode_i (1'b0 ), + .axi_req_i (narrow_axi_slv_req[ClusterPeripherals]), + .axi_rsp_o (narrow_axi_slv_rsp[ClusterPeripherals]), + .reg_req_o (reg_req ), + .reg_rsp_i (reg_rsp ) + ); + + spatz_cluster_peripheral #( + .AddrWidth (AxiAddrWidth ), + .SPMWidth ($clog2(L1NumSet)), + .reg_req_t (reg_req_t ), + .reg_rsp_t (reg_rsp_t ), + .tcdm_events_t (tcdm_events_t ), + .dma_events_t (dma_events_t ), + .NrCores (NrCores ) + ) i_snitch_cluster_peripheral ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .reg_req_i (reg_req ), + .reg_rsp_o (reg_rsp ), + /// The TCDM always starts at the cluster base. + .tcdm_start_address_i (tcdm_start_address ), + .tcdm_end_address_i (tcdm_end_address ), + .icache_prefetch_enable_o (icache_prefetch_enable), + .cl_clint_o (cl_interrupt ), + .cluster_hart_base_id_i (hart_base_id_i ), + .core_events_i (core_events ), + .tcdm_events_i (tcdm_events ), + .dma_events_i (dma_events ), + .icache_events_i (icache_events ), + .cluster_probe_o (tile_probe_o ), + .dynamic_offset_o (dynamic_offset ), + .l1d_spm_size_o (cfg_spm_size ), + .l1d_insn_o (l1d_insn ), + .l1d_insn_valid_o (l1d_insn_valid ), + // TODO: Here we only check controller 0 + .l1d_insn_ready_i (l1d_insn_ready[0] ), + .l1d_busy_o (l1d_busy ) + ); + + // 3. BootROM + assign axi_wide_req_o[TileBootROM] = wide_axi_slv_req[BootROM]; + assign wide_axi_slv_rsp[BootROM] = axi_wide_rsp_i[TileBootROM]; + + // Upsize the narrow SoC connection + `AXI_TYPEDEF_ALL(axi_mst_dma_narrow, addr_t, id_dma_mst_t, data_t, strb_t, user_t) + axi_mst_dma_narrow_req_t narrow_axi_slv_req_soc; + axi_mst_dma_narrow_resp_t narrow_axi_slv_resp_soc; + + axi_iw_converter #( + .AxiAddrWidth (AxiAddrWidth ), + .AxiDataWidth (NarrowDataWidth ), + .AxiUserWidth (AxiUserWidth ), + .AxiSlvPortIdWidth (NarrowIdWidthOut ), + .AxiSlvPortMaxUniqIds (1 ), + .AxiSlvPortMaxTxnsPerId(1 ), + .AxiSlvPortMaxTxns (1 ), + .AxiMstPortIdWidth (WideIdWidthIn ), + .AxiMstPortMaxUniqIds (1 ), + .AxiMstPortMaxTxnsPerId(1 ), + .slv_req_t (axi_slv_req_t ), + .slv_resp_t (axi_slv_resp_t ), + .mst_req_t (axi_mst_dma_narrow_req_t ), + .mst_resp_t (axi_mst_dma_narrow_resp_t) + ) i_soc_port_iw_convert ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .slv_req_i (narrow_axi_slv_req[SoC] ), + .slv_resp_o (narrow_axi_slv_rsp[SoC] ), + .mst_req_o (narrow_axi_slv_req_soc ), + .mst_resp_i (narrow_axi_slv_resp_soc ) + ); + + axi_dw_converter #( + .AxiAddrWidth (AxiAddrWidth ), + .AxiIdWidth (WideIdWidthIn ), + .AxiMaxReads (2 ), + .AxiSlvPortDataWidth(NarrowDataWidth ), + .AxiMstPortDataWidth(AxiDataWidth ), + .ar_chan_t (axi_mst_dma_ar_chan_t ), + .aw_chan_t (axi_mst_dma_aw_chan_t ), + .b_chan_t (axi_mst_dma_b_chan_t ), + .slv_r_chan_t (axi_mst_dma_narrow_r_chan_t), + .slv_w_chan_t (axi_mst_dma_narrow_w_chan_t), + .axi_slv_req_t (axi_mst_dma_narrow_req_t ), + .axi_slv_resp_t (axi_mst_dma_narrow_resp_t ), + .mst_r_chan_t (axi_mst_dma_r_chan_t ), + .mst_w_chan_t (axi_mst_dma_w_chan_t ), + .axi_mst_req_t (axi_mst_dma_req_t ), + .axi_mst_resp_t (axi_mst_dma_resp_t ) + ) i_soc_port_dw_upsize ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + 
.slv_req_i (narrow_axi_slv_req_soc ), + .slv_resp_o (narrow_axi_slv_resp_soc ), + .mst_req_o (wide_axi_mst_req[CoreReqWide]), + .mst_resp_i (wide_axi_mst_rsp[CoreReqWide]) + ); + + // -------------------- + // TCDM event counters + // -------------------- + logic [NrTCDMPortsCores-1:0] flat_acc, flat_con; + for (genvar i = 0; i < NrTCDMPortsCores; i++) begin : gen_event_counter + `FFARN(flat_acc[i], tcdm_req[i].q_valid, '0, clk_i, rst_ni) + `FFARN(flat_con[i], tcdm_req[i].q_valid & ~tcdm_rsp[i].q_ready, '0, clk_i, rst_ni) + end + + popcount #( + .INPUT_WIDTH ( NrTCDMPortsCores ) + ) i_popcount_req ( + .data_i ( flat_acc ), + .popcount_o ( tcdm_events.inc_accessed ) + ); + + popcount #( + .INPUT_WIDTH ( NrTCDMPortsCores ) + ) i_popcount_con ( + .data_i ( flat_con ), + .popcount_o ( tcdm_events.inc_congested ) + ); + + // ------------- + // Sanity Checks + // ------------- + // Sanity check the parameters. Not every configuration makes sense. + `ASSERT_INIT(CheckSuperBankSanity, NrBanks >= BanksPerSuperBank); + `ASSERT_INIT(CheckSuperBankFactor, (NrBanks % BanksPerSuperBank) == 0); + // Check that the cluster base address aligns to the TCDMSize. + `ASSERT(ClusterBaseAddrAlign, ((TCDMSize - 1) & cluster_base_addr_i) == 0) + // Make sure we only have one DMA in the system. + `ASSERT_INIT(NumberDMA, $onehot0(Xdma)) + +endmodule diff --git a/hardware/src/tcdm_cache_interco.sv b/hardware/src/tcdm_cache_interco.sv new file mode 100644 index 0000000..2dc49cf --- /dev/null +++ b/hardware/src/tcdm_cache_interco.sv @@ -0,0 +1,221 @@ +// Copyright 2025 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +// Author: Diyou Shen + +// Cache crossbar: selects the target cache bank for each core request + +module tcdm_cache_interco #( + /// Number of inputs into the interconnect (`> 0`). + parameter int unsigned NumCore = 32'd0, + /// Number of outputs from the interconnect (`> 0`). + parameter int unsigned NumCache = 32'd0, + /// Width of the request address. The bank-selection offset (e.g. a 512b + /// cacheline gives 6 offset bits) is set at runtime via `dynamic_offset_i`. + parameter int unsigned AddrWidth = 32'd32, + + /// Port type of the data request ports. + parameter type tcdm_req_t = logic, + /// Port type of the data response ports. + parameter type tcdm_rsp_t = logic, + /// Payload type of the data request ports. + parameter type tcdm_req_chan_t = logic, + /// Payload type of the data response ports. + parameter type tcdm_rsp_chan_t = logic, + + parameter snitch_pkg::topo_e Topology = snitch_pkg::LogarithmicInterconnect +) ( + /// Clock, positive edge triggered. + input logic clk_i, + /// Reset, active low. + input logic rst_ni, + /// Dynamic address offset for cache bank selection + input logic [$clog2(AddrWidth)-1:0] dynamic_offset_i, + /// Request port. + input tcdm_req_t [NumCore-1:0] core_req_i, + /// Response ready in + input logic [NumCore-1:0] core_rsp_ready_i, + /// Response port. + output tcdm_rsp_t [NumCore-1:0] core_rsp_o, + /// Memory Side + /// Request. + output tcdm_req_t [NumCache-1:0] mem_req_o, + /// Response ready out + output logic [NumCache-1:0] mem_rsp_ready_o, + /// Response. 
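+ /// Responses are routed back to the issuing core via the core ID carried in the response user field.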
+ input tcdm_rsp_t [NumCache-1:0] mem_rsp_i +); + + // -------- + // Parameters and Signals + // -------- + + // Selection signal width and types + localparam int unsigned NumMemSelBits = $clog2(NumCache); + localparam int unsigned NumCoreSelBits = $clog2(NumCore); + + typedef logic [NumMemSelBits-1 :0] mem_sel_t; + typedef logic [NumCoreSelBits-1:0] core_sel_t; + + // Per-request selection of the target cache bank; each selector is as wide + // as the index of the respective crossbar's outputs. + mem_sel_t [NumCore-1:0] core_req_sel; + core_sel_t [NumCache -1:0] mem_rsp_sel; + + // Number of bits used to identify the cache bank + localparam int unsigned CacheBankBits = $clog2(NumCache); + + tcdm_req_chan_t [NumCore-1:0] core_req; + logic [NumCore-1:0] core_req_valid, core_req_ready; + + tcdm_req_chan_t [NumCache -1:0] mem_req; + logic [NumCache -1:0] mem_req_valid, mem_req_ready; + + tcdm_rsp_chan_t [NumCore-1:0] core_rsp; + logic [NumCore-1:0] core_rsp_valid, core_rsp_ready; + + tcdm_rsp_chan_t [NumCache -1:0] mem_rsp; + logic [NumCache -1:0] mem_rsp_valid, mem_rsp_ready; + + + // -------- + // Xbar + // -------- + + stream_xbar #( + .NumInp (NumCore ), + .NumOut (NumCache ), + .payload_t (tcdm_req_chan_t ) + ) i_req_xbar ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .flush_i(1'b0 ), + // External priority flag + .rr_i ('0 ), + // Master + .data_i (core_req ), + .valid_i(core_req_valid ), + .ready_o(core_req_ready ), + .sel_i (core_req_sel ), + // Slave + .data_o (mem_req ), + .valid_o(mem_req_valid ), + .ready_i(mem_req_ready ), + .idx_o (/* Unused */ ) + ); + + stream_xbar #( + .NumInp (NumCache ), + .NumOut (NumCore ), + .payload_t (tcdm_rsp_chan_t ) + ) i_rsp_xbar ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .flush_i(1'b0 ), + // External priority flag + .rr_i ('0 ), + // Master + .data_i (mem_rsp ), + .valid_i(mem_rsp_valid ), + .ready_o(mem_rsp_ready ), + .sel_i (mem_rsp_sel ), + // Slave + .data_o (core_rsp ), + .valid_o(core_rsp_valid ), + .ready_i(core_rsp_ready ), + .idx_o (/* Unused */ ) + ); + + // -------- + // Selection Signals + // -------- + + // select the target cache bank based on the `bank` bits + // Example: 128 KiB total, 4 way, 4 cache banks, 512b cacheline + // => 128*1024 = 2^17 Byte => 2^(17-6) = 2^11 cachelines + // => 2^11/4 = 2^9 sets per cache bank => 2^9/4 = 2^7 sets per way per cache bank + // => 7 bits index; 2 bits cache bank bits; + // addr: Tag: [31:15]; Index: [14:8]; Cache Bank: [7:6]; Offset: [5:0] + for (genvar port = 0; port < NumCore; port++) begin : gen_req_sel + // assign core_req_sel[port] = core_req[port].addr[13:12]; + assign core_req_sel[port] = core_req[port].addr[dynamic_offset_i+:CacheBankBits]; + end + + // forward response to the sender core + for (genvar port = 0; port < NumCache; port++) begin : gen_rsp_sel + assign mem_rsp_sel[port] = mem_rsp[port].user.core_id; + end + + // -------- + // Registers + // -------- + + for (genvar port = 0; port < NumCore; port++) begin : gen_cache_interco_reg + spill_register #( + .T (tcdm_req_chan_t ) + ) i_tcdm_req_reg ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .data_i (core_req_i[port].q ), + .valid_i(core_req_i[port].q_valid ), + .ready_o(core_rsp_o[port].q_ready ), + .data_o (core_req[port] ), + .valid_o(core_req_valid[port] ), + .ready_i(core_req_ready[port] ) + ); + + fall_through_register #( + .T (tcdm_rsp_chan_t ) + ) i_tcdm_rsp_reg ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .clr_i (1'b0 ), + .testmode_i(1'b0 ), + .data_i (core_rsp[port] ), + .valid_i (core_rsp_valid[port] ), + .ready_o (core_rsp_ready[port] ), + .data_o (core_rsp_o[port].p ), + .valid_o (core_rsp_o[port].p_valid 
), + .ready_i (core_rsp_ready_i[port] ) + ); + end + + + // -------- + // IO Assignment + // -------- + + // Strip the bank-select bits used above for scrambling back out of the full + // address before it is forwarded to the cache. + + logic [AddrWidth-1:0] bitmask_up, bitmask_lo; + // Address bits below the dynamic offset are kept unchanged + assign bitmask_lo = (1 << dynamic_offset_i) - 1; + // Keep the upper AddrWidth - dynamic_offset_i - NumMemSelBits bits and drop the NumMemSelBits bank-select bits + assign bitmask_up = ((1 << (AddrWidth - dynamic_offset_i - NumMemSelBits)) - 1) << dynamic_offset_i; + + + for (genvar port = 0; port < NumCache; port++) begin : gen_cache_io + always_comb begin + mem_req_o[port] = '{ + q: mem_req[port], + q_valid: mem_req_valid[port], + default: '0 + }; + + // Remove the middle bank-select bits (two of them in the example above) + mem_req_o[port].q.addr = (mem_req[port].addr & bitmask_lo) | + ((mem_req[port].addr >> NumMemSelBits) & bitmask_up); + + end + + assign mem_rsp[port] = mem_rsp_i[port].p; + assign mem_rsp_valid[port] = mem_rsp_i[port].p_valid; + assign mem_req_ready[port] = mem_rsp_i[port].q_ready; + end + + assign mem_rsp_ready_o = mem_rsp_ready; + + +endmodule diff --git a/hardware/tb/cachepool_cluster_wrapper.sv b/hardware/tb/cachepool_cluster_wrapper.sv new file mode 100644 index 0000000..76db84f --- /dev/null +++ b/hardware/tb/cachepool_cluster_wrapper.sv @@ -0,0 +1,176 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + + + +`include "axi/typedef.svh" + +module cachepool_cluster_wrapper + import cachepool_pkg::*; + import fpnew_pkg::fpu_implementation_t; + import snitch_pma_pkg::snitch_pma_t; + #( + parameter int unsigned AxiAddrWidth = SpatzAxiAddrWidth, + parameter int unsigned AxiDataWidth = SpatzAxiDataWidth, + parameter int unsigned AxiUserWidth = SpatzAxiUserWidth, + parameter int unsigned AxiInIdWidth = SpatzAxiIdInWidth, + parameter int unsigned AxiOutIdWidth = SpatzAxiIdOutWidth, + + parameter type axi_in_resp_t = spatz_axi_in_resp_t, + parameter type axi_in_req_t = spatz_axi_in_req_t, + + parameter type axi_out_resp_t = spatz_axi_out_resp_t, + parameter type axi_out_req_t = spatz_axi_out_req_t +)( + input logic clk_i, + input logic rst_ni, + input logic [NumCores-1:0] debug_req_i, + + input logic [NumCores-1:0] meip_i, + input logic [NumCores-1:0] mtip_i, + input logic [NumCores-1:0] msip_i, + output logic cluster_probe_o, + input axi_in_req_t axi_in_req_i, + output axi_in_resp_t axi_in_resp_o, + output axi_out_req_t axi_out_req_o, + input axi_out_resp_t axi_out_resp_i, + output axi_out_req_t axi_out_l2_req_o, + input axi_out_resp_t axi_out_l2_resp_i +); + + localparam int unsigned NumIntOutstandingLoads [NumCores] = '{default: 16}; + localparam int unsigned NumIntOutstandingMem [NumCores] = '{default: 16}; + localparam int unsigned NumSpatzOutstandingLoads [NumCores] = '{default: 16}; + + spatz_axi_iwc_out_req_t axi_from_cluster_iwc_req; + spatz_axi_iwc_out_resp_t axi_from_cluster_iwc_resp; + spatz_axi_iwc_out_req_t axi_from_cluster_l2_req; + spatz_axi_iwc_out_resp_t axi_from_cluster_l2_resp; + + axi_iw_converter #( + .AxiSlvPortIdWidth ( IwcAxiIdOutWidth ), + .AxiMstPortIdWidth ( AxiOutIdWidth ), + .AxiSlvPortMaxUniqIds ( 2 ), + .AxiSlvPortMaxTxnsPerId ( 2 ), + .AxiSlvPortMaxTxns ( 4 ), + .AxiMstPortMaxUniqIds ( 2 ), + .AxiMstPortMaxTxnsPerId ( 4 ), + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( AxiDataWidth ), + .AxiUserWidth ( AxiUserWidth ), + .slv_req_t ( spatz_axi_iwc_out_req_t ), + 
.slv_resp_t ( spatz_axi_iwc_out_resp_t), + .mst_req_t ( axi_out_req_t ), + .mst_resp_t ( axi_out_resp_t ) + ) iw_converter( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( axi_from_cluster_iwc_req ), + .slv_resp_o ( axi_from_cluster_iwc_resp ), + .mst_req_o ( axi_out_req_o ), + .mst_resp_i ( axi_out_resp_i ) + ); + + axi_iw_converter #( + .AxiSlvPortIdWidth ( IwcAxiIdOutWidth ), + .AxiMstPortIdWidth ( AxiOutIdWidth ), + .AxiSlvPortMaxUniqIds ( 2 ), + .AxiSlvPortMaxTxnsPerId ( 2 ), + .AxiSlvPortMaxTxns ( 4 ), + .AxiMstPortMaxUniqIds ( 2 ), + .AxiMstPortMaxTxnsPerId ( 4 ), + .AxiAddrWidth ( AxiAddrWidth ), + .AxiDataWidth ( AxiDataWidth ), + .AxiUserWidth ( AxiUserWidth ), + .slv_req_t ( spatz_axi_iwc_out_req_t ), + .slv_resp_t ( spatz_axi_iwc_out_resp_t), + .mst_req_t ( axi_out_req_t ), + .mst_resp_t ( axi_out_resp_t ) + ) iw_converter_l2( + .clk_i ( clk_i ), + .rst_ni ( rst_ni ), + .slv_req_i ( axi_from_cluster_l2_req ), + .slv_resp_o ( axi_from_cluster_l2_resp ), + .mst_req_o ( axi_out_l2_req_o ), + .mst_resp_i ( axi_out_l2_resp_i ) + ); + + // Spatz cluster under test. + cachepool_cluster #( + .AxiAddrWidth (AxiAddrWidth ), + .AxiDataWidth (AxiDataWidth ), + .AxiIdWidthIn (AxiInIdWidth ), + .AxiIdWidthOut (IwcAxiIdOutWidth ), + .AxiUserWidth (AxiUserWidth ), + .BootAddr (BootAddr ), + .L2Addr (L2Addr ), + .L2Size (L2Size ), + .ClusterPeriphSize (64 ), + .NrCores (NumCores ), + .TCDMDepth (TCDMDepth ), + .NrBanks (NumBank ), + .ICacheLineWidth (ICacheLineWidth ), + .ICacheLineCount (ICacheLineCount ), + .ICacheSets (ICacheSets ), + .FPUImplementation (FPUImplementation ), + .NumSpatzFPUs (NFpu ), + .NumSpatzIPUs (NIpu ), + .SnitchPMACfg (SnitchPMACfg ), + .NumIntOutstandingLoads (NumIntOutstandingLoads ), + .NumIntOutstandingMem (NumIntOutstandingMem ), + .NumSpatzOutstandingLoads (NumSpatzOutstandingLoads ), + .axi_in_req_t (axi_in_req_t ), + .axi_in_resp_t (axi_in_resp_t ), + .axi_out_req_t (spatz_axi_iwc_out_req_t ), + .axi_out_resp_t (spatz_axi_iwc_out_resp_t ), + .Xdma (4'h1 ), + .DMAAxiReqFifoDepth (3 ), + .DMAReqFifoDepth (3 ), + .RegisterOffloadRsp (1 ), + .RegisterCoreReq (1 ), + .RegisterCoreRsp (1 ), + .RegisterTCDMCuts (1 ), + .RegisterExt (0 ), + .XbarLatency (axi_pkg::CUT_ALL_PORTS ), + .MaxMstTrans (4 ), + .MaxSlvTrans (4 ) + ) i_cluster ( + .clk_i, + .rst_ni, + .impl_i( '0 ), + .error_o(), + .debug_req_i, + .meip_i, + .mtip_i, + .msip_i, + .hart_base_id_i (10'h10), + .cluster_base_addr_i (TCDMStartAddr), + .cluster_probe_o, + .axi_in_req_i, + .axi_in_resp_o, + // AXI Master Port + .axi_out_req_o ( axi_from_cluster_iwc_req ), + .axi_out_resp_i ( axi_from_cluster_iwc_resp ), + .axi_out_l2_req_o ( axi_from_cluster_l2_req ), + .axi_out_l2_resp_i ( axi_from_cluster_l2_resp ) + ); + + // Assertions + + if (AxiAddrWidth != SpatzAxiAddrWidth) + $error("[spatz_cluster_wrapper] AXI Address Width does not match the configuration."); + + if (AxiDataWidth != SpatzAxiDataWidth) + $error("[spatz_cluster_wrapper] AXI Data Width does not match the configuration."); + + if (AxiUserWidth != SpatzAxiUserWidth) + $error("[spatz_cluster_wrapper] AXI User Width does not match the configuration."); + + if (AxiInIdWidth != SpatzAxiIdInWidth) + $error("[spatz_cluster_wrapper] AXI Id Width (In) does not match the configuration."); + + if (AxiOutIdWidth != SpatzAxiIdOutWidth) + $error("[spatz_cluster_wrapper] AXI Id Width (Out) does not match the configuration."); + +endmodule diff --git a/hardware/tb/testharness.sv b/hardware/tb/testharness.sv new file mode 100644 index 0000000..1c3b8d8 --- 
/dev/null +++ b/hardware/tb/testharness.sv @@ -0,0 +1,203 @@ +// Copyright 2021 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +`define wait_for(signal) do @(negedge clk_i); while (!signal); + +`include "axi/assign.svh" +`include "axi/typedef.svh" +`include "reqrsp_interface/typedef.svh" + +module testharness ( + input logic clk_i, + input logic rst_ni + ); + + import cachepool_pkg::*; + import spatz_cluster_peripheral_reg_pkg::*; + import axi_pkg::xbar_cfg_t; + import axi_pkg::xbar_rule_32_t; + + import "DPI-C" function int get_entry_point(); + + /********* + * AXI * + *********/ + + localparam NumAXISlaves = 2; + localparam NumRules = NumAXISlaves-1; + + // Spatz wide port to SoC (currently dram) + spatz_axi_out_req_t axi_from_cluster_req; + spatz_axi_out_resp_t axi_from_cluster_resp; + // Spatz wide port to L2 + spatz_axi_out_req_t axi_l2_req; + spatz_axi_out_resp_t axi_l2_resp; + // From SoC to Spatz + spatz_axi_in_req_t axi_to_cluster_req; + spatz_axi_in_resp_t axi_to_cluster_resp; + + + /********* + * DUT * + *********/ + + logic cluster_probe; + logic [NumCores-1:0] debug_req; + + cachepool_cluster_wrapper i_cluster_wrapper ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .meip_i ('0 ), + .msip_i ('0 ), + .mtip_i ('0 ), + .debug_req_i ( debug_req ), + .axi_out_req_o (axi_from_cluster_req ), + .axi_out_resp_i (axi_from_cluster_resp), + .axi_out_l2_req_o ( axi_l2_req ), + .axi_out_l2_resp_i ( axi_l2_resp ), + .axi_in_req_i (axi_to_cluster_req ), + .axi_in_resp_o (axi_to_cluster_resp ), + .cluster_probe_o (cluster_probe ) + ); +/************** + * VCD Dump * + **************/ + +`ifdef VCD_DUMP + initial begin: vcd_dump + // Wait for the reset + wait (rst_ni); + + // Wait until the probe is high + while (!cluster_probe) + @(posedge clk_i); + + // Dump signals of group 0 + $dumpfile(`VCD_DUMP_FILE); + $dumpvars(0, i_cluster_wrapper); + $dumpon; + + // Wait until the probe is low + while (cluster_probe) + @(posedge clk_i); + + $dumpoff; + + // Stop the execution + $finish(0); + end: vcd_dump +`endif + + /************************ + * Simulation control * + ************************/ + + `REQRSP_TYPEDEF_ALL(reqrsp_cluster_in, axi_addr_t, logic [63:0], logic [7:0]) + reqrsp_cluster_in_req_t to_cluster_req; + reqrsp_cluster_in_rsp_t to_cluster_rsp; + + reqrsp_to_axi #( + .DataWidth (SpatzDataWidth ), + .UserWidth (SpatzAxiUserWidth ), + .axi_req_t (spatz_axi_in_req_t ), + .axi_rsp_t (spatz_axi_in_resp_t ), + .reqrsp_req_t(reqrsp_cluster_in_req_t), + .reqrsp_rsp_t(reqrsp_cluster_in_rsp_t) + ) i_axi_to_reqrsp ( + .clk_i (clk_i ), + .rst_ni (rst_ni ), + .user_i ('0 ), + .axi_req_o (axi_to_cluster_req ), + .axi_rsp_i (axi_to_cluster_resp), + .reqrsp_req_i(to_cluster_req ), + .reqrsp_rsp_o(to_cluster_rsp ) + ); + + logic [31:0] entry_point; + initial begin + // Idle + to_cluster_req = '0; + debug_req = '0; + + // Wait for a while + repeat (10) + @(negedge clk_i); + + // Load the entry point + entry_point = get_entry_point(); + $display("Loading entry point: %0x", entry_point); + + // Wait for a while + repeat (1000) + @(negedge clk_i); + + // Store the entry point in the Spatz cluster + to_cluster_req = '{ + q: '{ + addr : PeriStartAddr + SPATZ_CLUSTER_PERIPHERAL_CLUSTER_BOOT_CONTROL_OFFSET, + data : {32'b0, entry_point}, + write : 1'b1, + strb : '1, + amo : reqrsp_pkg::AMONone, + default: '0 + }, + q_valid: 1'b1, + p_ready: 1'b0 + }; + `wait_for(to_cluster_rsp.q_ready); + to_cluster_req = '0; + 
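// The boot-control write above has been granted (q_ready seen); now wait for + // its response on the p channel and acknowledge it with p_ready before + // releasing the cores via debug_req. + 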
`wait_for(to_cluster_rsp.p_valid); + to_cluster_req = '{ + p_ready: 1'b1, + q : '{ + amo : reqrsp_pkg::AMONone, + default: '0 + }, + default: '0 + }; + @(negedge clk_i); + to_cluster_req = '0; + + + // Wake up cores + debug_req = '1; + @(negedge clk_i); + debug_req = '0; + end + + /******** + * L2 * + ********/ + + // Wide port into simulation memory. + tb_memory_axi #( + .AxiAddrWidth ( SpatzAxiAddrWidth ), + .AxiDataWidth ( SpatzAxiDataWidth ), + .AxiIdWidth ( SpatzAxiIdOutWidth ), + .AxiUserWidth ( SpatzAxiUserWidth ), + .req_t ( spatz_axi_out_req_t ), + .rsp_t ( spatz_axi_out_resp_t ) + ) i_dma ( + .clk_i (clk_i ), + .rst_ni(rst_ni ), + .req_i (axi_from_cluster_req ), + .rsp_o (axi_from_cluster_resp) + ); + + // Wide port into simulation memory. + tb_memory_axi #( + .AxiAddrWidth ( SpatzAxiAddrWidth ), + .AxiDataWidth ( SpatzAxiDataWidth ), + .AxiIdWidth ( SpatzAxiIdOutWidth ), + .AxiUserWidth ( SpatzAxiUserWidth ), + .req_t ( spatz_axi_out_req_t ), + .rsp_t ( spatz_axi_out_resp_t ) + ) i_l2mem ( + .clk_i (clk_i ), + .rst_ni(rst_ni ), + .req_i (axi_l2_req ), + .rsp_o (axi_l2_resp ) + ); + +endmodule : testharness diff --git a/sim/.gitignore b/sim/.gitignore new file mode 100644 index 0000000..c6d0c90 --- /dev/null +++ b/sim/.gitignore @@ -0,0 +1,2 @@ +work +bin diff --git a/sim/scripts/vsim_core.tcl b/sim/scripts/vsim_core.tcl new file mode 100644 index 0000000..12d5b6a --- /dev/null +++ b/sim/scripts/vsim_core.tcl @@ -0,0 +1,180 @@ +# Copyright 2021 ETH Zurich and University of Bologna. +# Solderpad Hardware License, Version 0.51, see LICENSE for details. +# SPDX-License-Identifier: SHL-0.51 + +# Create group for core $1 +onerror {resume} + +add wave -noupdate -group tile[$1]_core[$2] -group Params /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/BootAddr +add wave -noupdate -group tile[$1]_core[$2] /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/clk_i +add wave -noupdate -group tile[$1]_core[$2] /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/rst_i +add wave -noupdate -group tile[$1]_core[$2] -radix unsigned /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/hart_id_i + +add wave -noupdate -group tile[$1]_core[$2] -divider Instructions +add wave -noupdate -group tile[$1]_core[$2] /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/inst_addr_o +add wave -noupdate -group tile[$1]_core[$2] /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/inst_data_i +add wave -noupdate -group tile[$1]_core[$2] /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/inst_valid_o +add wave -noupdate -group tile[$1]_core[$2] /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/inst_ready_i + +add wave -noupdate -group tile[$1]_core[$2] -divider Load/Store +add wave -noupdate -group tile[$1]_core[$2] /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/data_req_o +add wave -noupdate -group tile[$1]_core[$2] /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/data_rsp_i + +add wave -noupdate -group tile[$1]_core[$2] -divider Accelerator +add wave -noupdate -group tile[$1]_core[$2] 
/tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/acc_qreq_o +add wave -noupdate -group tile[$1]_core[$2] /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/acc_qrsp_i +add wave -noupdate -group tile[$1]_core[$2] /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/acc_qvalid_o +add wave -noupdate -group tile[$1]_core[$2] /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/acc_qready_i +add wave -noupdate -group tile[$1]_core[$2] /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/acc_prsp_i +add wave -noupdate -group tile[$1]_core[$2] /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/acc_pvalid_i +add wave -noupdate -group tile[$1]_core[$2] /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/acc_pready_o + +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/illegal_inst +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/stall +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/lsu_stall +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/acc_stall +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/zero_lsb +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/pc_d +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/pc_q +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/wfi_d +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/wfi_q +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/fcsr_d +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/fcsr_q +add wave -noupdate -group tile[$1]_core[$2] -group Snitch -divider LSU +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/ls_size +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/ls_amo +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/ld_result +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/lsu_qready +add 
wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/lsu_qvalid +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/lsu_pvalid +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/lsu_pready +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/lsu_rd +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/retire_load +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/retire_i +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/retire_acc +add wave -noupdate -group tile[$1]_core[$2] -group Snitch -divider ALU +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/opa +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/opb +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/iimm +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/uimm +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/jimm +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/bimm +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/simm +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/adder_result +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/alu_result +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/rd +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/rs1 +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/rs2 +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/gpr_raddr +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/gpr_rdata +add wave -noupdate -group tile[$1]_core[$2] -group Snitch 
/tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/gpr_waddr +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/gpr_wdata +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/gpr_we +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/consec_pc +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/sb_d +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/sb_q +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/is_load +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/is_store +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/is_signed +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/ls_misaligned +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/ld_addr_misaligned +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/st_addr_misaligned +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/valid_instr +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/exception +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/alu_op +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/opa_select +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/opb_select +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/write_rd +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/uses_rd +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/next_pc +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/rd_select +add wave -noupdate -group tile[$1]_core[$2] -group Snitch 
/tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/rd_bypass +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/is_branch +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/csr_rvalue +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/csr_en +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/acc_register_rd +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/operands_ready +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/dst_ready +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/opa_ready +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/opb_ready +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/shift_opa +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/shift_opa_reversed +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/shift_right_result +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/shift_left_result +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/shift_opa_ext +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/shift_right_result_ext +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/shift_left +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/shift_arithmetic +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/alu_opa +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/alu_opb +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/alu_writeback +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/acc_mem_cnt_d +add wave -noupdate -group tile[$1]_core[$2] -group Snitch 
/tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/acc_mem_cnt_q +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/acc_mem_str_cnt_d +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/acc_mem_str_cnt_q +add wave -noupdate -group tile[$1]_core[$2] -group Snitch /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/core_events_o + +add wave -noupdate -group tile[$1]_core[$2] -group Snitch -group Internal -group RF /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/i_snitch_regfile/* +add wave -noupdate -group tile[$1]_core[$2] -group Snitch -group Internal /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_snitch/* + +add wave -noupdate -group tile[$1]_core[$2] -group Spatz /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/issue_valid_i +add wave -noupdate -group tile[$1]_core[$2] -group Spatz /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/issue_ready_o +add wave -noupdate -group tile[$1]_core[$2] -group Spatz /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/issue_req_i +add wave -noupdate -group tile[$1]_core[$2] -group Spatz /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/issue_rsp_o +add wave -noupdate -group tile[$1]_core[$2] -group Spatz /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/rsp_valid_o +add wave -noupdate -group tile[$1]_core[$2] -group Spatz /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/rsp_ready_i +add wave -noupdate -group tile[$1]_core[$2] -group Spatz /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/rsp_o +add wave -noupdate -group tile[$1]_core[$2] -group Spatz /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/spatz_mem_req_o +add wave -noupdate -group tile[$1]_core[$2] -group Spatz /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/spatz_mem_req_valid_o +add wave -noupdate -group tile[$1]_core[$2] -group Spatz /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/spatz_mem_req_ready_i +add wave -noupdate -group tile[$1]_core[$2] -group Spatz /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/spatz_mem_rsp_i +add wave -noupdate -group tile[$1]_core[$2] -group Spatz /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/spatz_mem_rsp_valid_i + +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group "FPU Sequencer" /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/gen_fpu_sequencer/i_fpu_sequencer/* +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group "FPU Sequencer" -group FPR /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/gen_fpu_sequencer/i_fpu_sequencer/i_fpr/* +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group "FPU Sequencer" -group LSU 
/tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/gen_fpu_sequencer/i_fpu_sequencer/i_fp_lsu/* + +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group Controller /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_controller/* + +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF -divider RegisterWrite +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_vrf/waddr_i +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_vrf/wdata_i +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_vrf/we_i +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_vrf/wbe_i +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_vrf/wvalid_o +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF -divider RegisterRead +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_vrf/raddr_i +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_vrf/rdata_o +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_vrf/re_i +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_vrf/rvalid_o +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF -divider Internal +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_vrf/waddr +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_vrf/wdata +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_vrf/we +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_vrf/wbe +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_vrf/raddr +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VRF /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_vrf/rdata + +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VLSU /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_vlsu/* + +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VSLDU 
/tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_vsldu/* + +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VFU /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_vfu/* + +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group MXU /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_vfu/i_mxu/* + +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group FPU /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_vfu/gen_fpu/* + +add wave -noupdate -group tile[$1]_core[$2] -group Internal /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/* + +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VLSU -group ROB0 /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_vlsu/gen_rob[0]/i_reorder_buffer/* +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group VLSU -group ROB1 /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/i_spatz/i_vlsu/gen_rob[1]/i_reorder_buffer/* + +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/spatz_mem_rsp +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/spatz_mem_fifo +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/spatz_mem_rsp_valid +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/spatz_mem_rsp_ready +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/spatz_mem_rsp_empty +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/spatz_mem_rsp_pop +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/spatz_mem_rsp_push +add wave -noupdate -group tile[$1]_core[$2] -group Spatz -group RSP_FIFO /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_core[$2]/i_spatz_cc/spatz_mem_fifo_bypass diff --git a/sim/scripts/vsim_tile.tcl b/sim/scripts/vsim_tile.tcl new file mode 100644 index 0000000..b33ad9f --- /dev/null +++ b/sim/scripts/vsim_tile.tcl @@ -0,0 +1,32 @@ +# Copyright 2021 ETH Zurich and University of Bologna. +# Solderpad Hardware License, Version 0.51, see LICENSE for details. 
+# SPDX-License-Identifier: SHL-0.51
+
+# Create group for Tile $1
+onerror {resume}
+
+# Add waves for tcdm_mapper and csrs
+add wave -noupdate -group tile[$1] -group Mapper /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/i_tcdm_mapper/*
+add wave -noupdate -group tile[$1] -group CSR /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/i_snitch_cluster_peripheral/*
+
+# Add waves for xbars
+add wave -noupdate -group tile[$1] -group core_xbar /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/i_tcdm_interconnect/*
+add wave -noupdate -group tile[$1] -group core_xbar -group req /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/i_tcdm_interconnect/gen_xbar/i_stream_xbar/*
+add wave -noupdate -group tile[$1] -group dma_xbar /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/i_dma_interconnect/*
+
+# Add waves for cache controllers
+for {set c 0} {$c < 4} {incr c} {
+  add wave -noupdate -group tile[$1] -group cache[$c] -group controller -group coalescer /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_l1_cache_ctrl[$c]/i_l1_controller/i_par_coalescer_for_spatz/gen_extend_window/i_par_coalescer_extend_window/i_par_coalescer/*
+  add wave -noupdate -group tile[$1] -group cache[$c] -group controller -group core /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_l1_cache_ctrl[$c]/i_l1_controller/i_insitu_cache_tcdm_wrapper/i_insitu_cache_tcdm_wrapper/i_insitu_cache_core/*
+  add wave -noupdate -group tile[$1] -group cache[$c] -group controller /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_l1_cache_ctrl[$c]/i_l1_controller/*
+
+  add wave -noupdate -group tile[$1] -group cache[$c] -group xbar -group req /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_cache_xbar[$c]/i_cache_xbar/i_req_xbar/*
+  add wave -noupdate -group tile[$1] -group cache[$c] -group xbar -group rsp /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_cache_xbar[$c]/i_cache_xbar/i_rsp_xbar/*
+  add wave -noupdate -group tile[$1] -group cache[$c] -group xbar /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_cache_xbar[$c]/i_cache_xbar/*
+}
+
+# Add waves for atomic units
+add wave -noupdate -group tile[$1] -group amo0_4 /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/gen_cache_connect[0]/gen_cache_amo[4]/i_cache_amo/*
+
+# Add waves for remaining signals
+add wave -noupdate -group tile[$1] /tb_bin/i_dut/i_cluster_wrapper/i_cluster/gen_tiles[$1]/i_tile/*
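+
+# NOTE: the loop bound of 4 and the amo instance indices above are hardcoded
+# and assumed to match the current cluster configuration; adjust them if the
+# number of L1 cache controllers or amo units per tile changes.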
diff --git a/sim/scripts/vsim_wave.tcl b/sim/scripts/vsim_wave.tcl
new file mode 100644
index 0000000..a2e0c75
--- /dev/null
+++ b/sim/scripts/vsim_wave.tcl
@@ -0,0 +1,19 @@
+# Copyright 2021 ETH Zurich and University of Bologna.
+# Solderpad Hardware License, Version 0.51, see LICENSE for details.
+# SPDX-License-Identifier: SHL-0.51
+
+onerror {resume}
+quietly WaveActivateNextPane {} 0
+
+# Add the cluster probe
+add wave /tb_bin/i_dut/cluster_probe
+
+# Add cluster waves
+add wave -noupdate -group Cluster /tb_bin/i_dut/i_cluster_wrapper/i_cluster/*
+
+do sim/scripts/vsim_tile.tcl 0
+
+# Add all cores in Tile 0
+for {set core 0} {$core < 4} {incr core} {
+  do sim/scripts/vsim_core.tcl 0 $core
+}
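+
+# Hypothetical extension: further tiles can be probed the same way, e.g.
+#   do sim/scripts/vsim_tile.tcl 1
+# provided the cluster is configured with more than one tile.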
diff --git a/software/.gitignore b/software/.gitignore
new file mode 100644
index 0000000..481c9d6
--- /dev/null
+++ b/software/.gitignore
@@ -0,0 +1,2 @@
+build
+toolchain
diff --git a/software/CMakeLists.txt b/software/CMakeLists.txt
new file mode 100644
index 0000000..858cdcb
--- /dev/null
+++ b/software/CMakeLists.txt
@@ -0,0 +1,24 @@
+# Copyright 2020 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+cmake_minimum_required(VERSION 3.13)
+
+set(SNITCH_SOFTWARE_DIR ${RUNTIME_DIR})
+set(SPATZ_SOFTWARE_DIR ${SPATZ_SW_DIR})
+
+list(APPEND CMAKE_MODULE_PATH ${SNITCH_SOFTWARE_DIR}/cmake)
+set(CMAKE_TOOLCHAIN_FILE toolchain-llvm)
+set(SNITCH_RUNTIME snRuntime-cluster CACHE STRING "")
+
+project(snitch_cluster LANGUAGES C ASM)
+include(SnitchUtilities)
+
+enable_testing()
+add_subdirectory(${SNITCH_SOFTWARE_DIR}/snRuntime snRuntime)
+# add_subdirectory(${SPATZ_SOFTWARE_DIR}/riscvTests riscvTests)
+# add_subdirectory(${SPATZ_SOFTWARE_DIR}/spatzBenchmarks spatzBenchmarks)
+
+add_subdirectory(${CACHEPOOL_DIR}/software/tests CachePoolTests)
diff --git a/software/cmake/SnitchUtilities.cmake b/software/cmake/SnitchUtilities.cmake
new file mode 100644
index 0000000..ec96cf4
--- /dev/null
+++ b/software/cmake/SnitchUtilities.cmake
@@ -0,0 +1,74 @@
+# Copyright 2020 ETH Zurich and University of Bologna.
+# Solderpad Hardware License, Version 0.51, see LICENSE for details.
+# SPDX-License-Identifier: SHL-0.51
+
+set(SNITCH_RUNTIME "snRuntime-cluster" CACHE STRING "Target name of the snRuntime flavor to link against")
+set(SNITCH_SIMULATOR "" CACHE PATH "Command to run a binary in an RTL simulation")
+set(SIMULATOR_TIMEOUT "1800" CACHE STRING "Timeout when running tests on RTL simulation")
+set(SPIKE_DASM "spike-dasm" CACHE PATH "Path to the spike-dasm for generating traces")
+set(LLVM_PATH "/home/spatz" CACHE PATH "Path to the LLVM RISCV installation")
+set(GCC_PATH "/home/spatz" CACHE PATH "Path to the GCC RISCV installation")
+set(RUNTIME_TRACE OFF CACHE BOOL "Enable runtime trace output")
+set(SNITCH_TEST_PREFIX "")
+if (SNITCH_SIMULATOR)
+    message(STATUS "Using RTL simulator: ${SNITCH_SIMULATOR}")
+endif()
+message(STATUS "Using runtime: ${SNITCH_RUNTIME}")
+
+# Toolchain to use
+set(CMAKE_TOOLCHAIN_FILE toolchain-llvm CACHE STRING "Toolchain to use")
+
+# Select to build the tests
+set(BUILD_TESTS OFF CACHE BOOL "Build test executables")
+
+macro(add_snitch_library name)
+    add_library(${ARGV})
+    add_custom_command(
+        TARGET ${name}
+        POST_BUILD
+        COMMAND ${CMAKE_OBJDUMP} -dhS $<TARGET_FILE:${name}> > $<TARGET_FILE:${name}>.s)
+endmacro()
+
+macro(add_snitch_executable name)
+    add_executable(${ARGV})
+    target_link_libraries(${name} ${SNITCH_RUNTIME})
+    target_link_options(${name} PRIVATE "SHELL:-T ${LINKER_SCRIPT}")
+    add_custom_command(
+        TARGET ${name}
+        POST_BUILD
+        COMMAND ${CMAKE_OBJDUMP} -dhS $<TARGET_FILE:${name}> > $<TARGET_FILE:${name}>.s)
+    # Run target for RTL simulator
+    if (SNITCH_SIMULATOR AND SNITCH_RUNTIME STREQUAL "snRuntime-cluster")
+        add_custom_target(run-rtl-${name}
+            COMMAND ${SNITCH_SIMULATOR} $<TARGET_FILE:${name}>
+            COMMAND for f in logs/trace_hart_*.dasm\; do ${SPIKE_DASM} < $$f | ${PYTHON} ${SNRUNTIME_SRC_DIR}/../../util/gen_trace.py > $$\(echo $$f | sed 's/\\.dasm/\\.txt/'\)\; done
+            DEPENDS $<TARGET_FILE:${name}>)
+    endif()
+endmacro()
+
+macro(add_snitch_test_executable name)
+    if (BUILD_TESTS)
+        add_snitch_executable(test-${SNITCH_TEST_PREFIX}${name} ${ARGN})
+    endif()
+endmacro()
+
+macro(add_snitch_raw_test_rtl test_name target_name)
+    add_test(NAME ${SNITCH_TEST_PREFIX}rtl-${test_name} COMMAND ${SNITCH_SIMULATOR} $<TARGET_FILE:${target_name}>)
+    set_property(TEST ${SNITCH_TEST_PREFIX}rtl-${test_name}
+        PROPERTY LABELS ${SNITCH_TEST_PREFIX})
+    set_tests_properties(${SNITCH_TEST_PREFIX}rtl-${test_name} PROPERTIES TIMEOUT ${SIMULATOR_TIMEOUT})
+    set_tests_properties(${SNITCH_TEST_PREFIX}rtl-${test_name} PROPERTIES PASS_REGULAR_EXPRESSION "SUCCESS;PASS")
+    set_tests_properties(${SNITCH_TEST_PREFIX}rtl-${test_name} PROPERTIES FAIL_REGULAR_EXPRESSION "FAILURE")
+endmacro()
+
+macro(add_snitch_test_rtl name)
+    # Pass the bare name; add_snitch_raw_test_rtl already prepends the
+    # ${SNITCH_TEST_PREFIX}rtl- prefix to the test name.
+    add_snitch_raw_test_rtl(${name} test-${SNITCH_TEST_PREFIX}${name})
+endmacro()
+
+macro(add_snitch_test name)
+    if (BUILD_TESTS)
+        message(STATUS "Adding test: ${name}")
+        add_snitch_test_executable(${ARGV})
+        add_snitch_test_rtl(${name})
+    endif()
+endmacro()
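+
+# Example usage (hypothetical test name, requires BUILD_TESTS=ON):
+#   add_snitch_test(my_kernel tests/my_kernel.c)
+# This builds test-${SNITCH_TEST_PREFIX}my_kernel and, when SNITCH_SIMULATOR is
+# set, registers a CTest case that passes on SUCCESS/PASS and fails on FAILURE.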
diff --git a/software/cmake/toolchain-llvm.cmake b/software/cmake/toolchain-llvm.cmake
new file mode 100644
index 0000000..55c3075
--- /dev/null
+++ b/software/cmake/toolchain-llvm.cmake
@@ -0,0 +1,46 @@
+# Copyright 2020 ETH Zurich and University of Bologna.
+# Solderpad Hardware License, Version 0.51, see LICENSE for details.
+# SPDX-License-Identifier: SHL-0.51
+
+# Look for the precompiled binaries
+set(CMAKE_C_COMPILER ${LLVM_PATH}/bin/clang)
+set(CMAKE_CXX_COMPILER ${LLVM_PATH}/bin/clang++)
+set(CMAKE_OBJCOPY ${LLVM_PATH}/bin/llvm-objcopy)
+set(CMAKE_OBJDUMP ${LLVM_PATH}/bin/llvm-objdump --mcpu=snitch --mattr=a --mattr=v --mattr=m --mattr=zfh)
+set(CMAKE_AR ${LLVM_PATH}/bin/llvm-ar)
+set(CMAKE_STRIP ${LLVM_PATH}/bin/llvm-strip)
+set(CMAKE_RANLIB ${LLVM_PATH}/bin/llvm-ranlib)
+
+##
+## Compile options
+##
+add_compile_options(-mcpu=snitch -mcmodel=small -ffast-math -fno-builtin-printf -fno-common -falign-loops=16)
+add_compile_options(-ffunction-sections)
+add_compile_options(-Wextra)
+add_compile_options(-static)
+add_compile_options(-mllvm -misched-topdown)
+# For smallfloat we need experimental extensions enabled (Zfh)
+add_compile_options(-menable-experimental-extensions)
+# LLD doesn't support relaxation for RISC-V yet
+add_compile_options(-mno-relax)
+# Set the ISA and ABI
+add_compile_options(-march=rv32imafdvzfh_xdma_xfquarter -mabi=ilp32d)
+# Set the GCC path
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --gcc-toolchain=${GCC_PATH}")
+
+##
+## Link options
+##
+add_link_options(-mcpu=snitch -static -mcmodel=small -fuse-ld=lld)
+add_link_options(-nostartfiles)
+add_link_options(-march=rv32imafdvzfh_xdma -mabi=ilp32d)
+add_link_options(-ffast-math -fno-common -fno-builtin-printf)
+
+link_libraries(-lm)
+link_libraries(-lgcc)
+
+# LLD defaults to -z relro which we don't want in a static ELF
+add_link_options(-Wl,-z,norelro)
+add_link_options(-Wl,--gc-sections)
+add_link_options(-Wl,--no-relax)
+#add_link_options(-Wl,--verbose)
diff --git a/software/snRuntime/.gitignore b/software/snRuntime/.gitignore
new file mode 100644
index 0000000..796b96d
--- /dev/null
+++ b/software/snRuntime/.gitignore
@@ -0,0 +1 @@
+/build
diff --git a/software/snRuntime/CMakeLists.txt b/software/snRuntime/CMakeLists.txt
new file mode 100644
index 0000000..770d955
--- /dev/null
+++ b/software/snRuntime/CMakeLists.txt
@@ -0,0 +1,139 @@
+# Copyright 2020 ETH Zurich and University of Bologna.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+cmake_minimum_required(VERSION 3.13)
+
+# Allow snRuntime to be built as a standalone library.
+if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+    # Read SnitchUtilities
+    list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../cmake)
+    include(SnitchUtilities)
+    # Create snRuntime project
+    project(snRuntime LANGUAGES C ASM)
+else()
+    # Export package information to includer.
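+    # A parent project that includes this file via add_subdirectory() can then,
+    # for example, use:
+    #   include_directories(${SNRUNTIME_INCLUDE_DIRS})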
+    set(SNRUNTIME_DIR ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
+    set(SNRUNTIME_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR} PARENT_SCOPE)
+    set(SNRUNTIME_INCLUDE_DIRS
+        ${CMAKE_CURRENT_SOURCE_DIR}/include
+        ${CMAKE_CURRENT_SOURCE_DIR}/vendor
+        ${CMAKE_CURRENT_SOURCE_DIR}/../toolchain/riscv-opcodes
+        PARENT_SCOPE)
+endif()
+
+add_compile_options(-O3 -g -ffunction-sections)
+
+# Default memory regions
+if(SNITCH_RUNTIME STREQUAL "snRuntime-cluster")
+    set(MEM_DRAM_ORIGIN "0x80000000" CACHE STRING "Base address of external memory")
+    set(MEM_DRAM_SIZE "0x80000000" CACHE STRING "Size of external memory")
+    set(L2_ORIGIN "0x51800000" CACHE STRING "Base address of L2 memory")
+    set(L2_SIZE "0x800000" CACHE STRING "Size of L2 memory")
+    set(UNCACHED_REGION_ORIGIN "0x52000000" CACHE STRING "Base address of UNCACHED_REGION memory")
+    set(UNCACHED_REGION_SIZE "0x800000" CACHE STRING "Size of UNCACHED_REGION memory")
+else()
+    set(MEM_DRAM_ORIGIN "0x80000000" CACHE STRING "Base address of external memory")
+    set(MEM_DRAM_SIZE "256M" CACHE STRING "Size of external memory")
+endif()
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/link/common.ld.in common.ld @ONLY)
+set(LINKER_SCRIPT ${CMAKE_CURRENT_BINARY_DIR}/common.ld CACHE PATH "")
+
+# provide linker script
+# set(LINKER_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/link/common.ld" CACHE PATH "")
+message(STATUS "Using common linker script: ${LINKER_SCRIPT}")
+
+# OpenMP
+set(OMPSTATIC_NUMTHREADS "0" CACHE STRING "If set to a non-zero value the OpenMP runtime is optimized to the number of cores")
+
+if(RUNTIME_TRACE)
+    # Enable runtime tracing
+    add_compile_definitions(__SNRT_USE_TRACE)
+endif()
+
+include_directories(
+    include
+    vendor
+    ../toolchain/riscv-opcodes
+)
+
+# Common sources
+set(sources
+    src/barrier.c
+    src/dma.c
+    src/memcpy.c
+    src/printf.c
+    src/team.c
+    src/alloc.c
+    src/interrupt.c
+    src/perf_cnt.c
+    src/l1cache.c
+)
+
+# platform specific sources
+set(standalone_snitch_sources
+    src/platforms/standalone/start_snitch.S
+    src/platforms/standalone/putchar.c
+)
+
+# Sources only compatible with the LLVM toolchain
+if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+    set(sources
+        ${sources}
+        # OpenMP support
+        src/omp/omp.c
+        src/omp/kmp.c
+        src/omp/eu.c
+        src/dm.c
+    )
+    # Check if static OpenMP runtime is requested
+    if(OMPSTATIC_NUMTHREADS GREATER 0)
+        message(STATUS "Using ${OMPSTATIC_NUMTHREADS} threads for optimized OpenMP runtime")
+        add_compile_definitions(OMPSTATIC_NUMTHREADS=${OMPSTATIC_NUMTHREADS})
+    else()
+        message(STATUS "Generic OpenMP runtime")
+    endif()
+endif()
+
+if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+    # Build all runtimes
+
+    # Generic Runtime (requiring bootloader/OS)
+    add_snitch_library(snRuntime src/start.S ${sources})
+
+    # Bare Runtimes (with startup code)
+    add_snitch_library(snRuntime-cluster src/platforms/shared/start.c ${standalone_snitch_sources} ${sources})
+
+else()
+    # snRuntime is added externally, only build required runtime
+    if(SNITCH_RUNTIME STREQUAL "snRuntime")
+        # Generic Runtime (requiring bootloader/OS)
+        add_snitch_library(snRuntime src/platforms/shared/start.c src/start.S ${sources})
+    # Bare Runtimes (with startup code)
+    elseif(SNITCH_RUNTIME STREQUAL "snRuntime-cluster")
+        add_snitch_library(snRuntime-cluster src/platforms/shared/start.c ${standalone_snitch_sources} ${sources})
+    else()
+        message(FATAL_ERROR "Requested runtime not implemented: ${SNITCH_RUNTIME}")
+    endif()
+endif()
+
+# Tests
+enable_testing()
+set(SNITCH_TEST_PREFIX snRuntime-)
+
+# General snRuntime tests
+add_snitch_test(tls tests/tls.c)
+add_snitch_test(simple tests/simple.c)
+add_snitch_test(varargs_1 tests/varargs_1.c)
+add_snitch_test(varargs_2 tests/varargs_2.c)
+add_snitch_test(barrier tests/barrier.c)
+add_snitch_test(fence_i tests/fence_i.c)
+add_snitch_test(interrupt-local tests/interrupt-local.c)
+add_snitch_test(printf_simple tests/printf_simple.c)
+
+# RTL-only tests
+if(SNITCH_RUNTIME STREQUAL "snRuntime-cluster")
+    add_snitch_test(dma_simple tests/dma_simple.c)
+    add_snitch_test(atomics tests/atomics.c)
+endif()
diff --git a/software/snRuntime/README.md b/software/snRuntime/README.md
new file mode 100644
index 0000000..d19e74e
--- /dev/null
+++ b/software/snRuntime/README.md
@@ -0,0 +1,48 @@
+# Snitch Runtime Library
+
+This library implements a minimal runtime for Snitch systems that is responsible for the following:
+
+- Detecting the hardware configuration (cores, clusters, ISA extensions, TCDM)
+- Passing a descriptor struct to the executable
+- Synchronization across cores and clusters
+- Team-based multithreading and work splitting
+
+## General Runtime
+
+The general runtime (`libsnRuntime`) relies on a bootloader or operating system to load the executable. This usually requires virtual memory to map the segments to the correct addresses. In this scenario, the general runtime provides no startup code; it acts as a regular library offering a useful API.
+
+## Bare Runtime
+
+The bare runtimes (e.g., `libsnRuntime-cluster`) assume that the executable they are linked into runs bare-metal, without a bootloader or virtual-memory setup. For this scenario, the runtime provides the `_start` symbol and implements a basic crt0.
+
+## Usage
+
+The runtime library can be compiled as follows:
+
+    mkdir build
+    cd build
+    cmake ..
+    make
+
+The tests can be executed as follows:
+
+    make test
+
+Interesting CMake options that can be set via `-D