google · alanvgreen · Apr 19, 2022 · Apr 18, 2022 · Apr 19, 2022 · Apr 18, 2022
diff --git a/docs/source/setup-guide.rst b/docs/source/setup-guide.rst
@@ -45,6 +45,12 @@ This updates submodules, builds some local executables, and installs missing Lin
    cd CFU-Playground
    ./scripts/setup
 
+If you intend to use Amaranth to build CFUs, you may need a compatible version of Yosys,
+which can be installed with:
+
+.. code-block:: bash
+
+   pip3 install amaranth-yosys
 
 Step 4: Install Toolchain
 --------------------------------------------

diff --git a/proj/fccm_tutorial/Makefile b/proj/fccm_tutorial/Makefile
@@ -0,0 +1,55 @@
+#!/bin/env python
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This variable lists symbols to define to the C preprocessor
+export DEFINES :=
+
+# Uncomment this line to use software defined CFU functions in software_cfu.cc
+#DEFINES += CFU_SOFTWARE_DEFINED
+
+# Uncomment these lines to print parameters of the conv2d operation
+DEFINES += CONV_PRINT_PARAMS
+DEFINES += DEPTHWISE_PRINT_PARAMS
+
+# Uncomment these lines to allow acceleration
+DEFINES += CONV_ACCELERATE
+DEFINES += DEPTHWISE_ACCELERATE
+
+# Uncomment this line to skip debug code (large effect on performance)
+DEFINES += NDEBUG
+
+# Uncomment this line to skip individual profiling output (has minor effect on performance).
+#DEFINES += NPROFILE
+
+# Uncomment to include specified model in built binary
+#DEFINES += INCLUDE_MODEL_PDTI8
+#DEFINES += INCLUDE_MODEL_MICRO_SPEECH
+#DEFINES += INCLUDE_MODEL_MAGIC_WAND
+#DEFINES += INCLUDE_MODEL_MNV2
+#DEFINES += INCLUDE_MODEL_HPS
+#DEFINES += INCLUDE_MODEL_MLCOMMONS_TINY_V01_ANOMD
+#DEFINES += INCLUDE_MODEL_MLCOMMONS_TINY_V01_IMGC
+DEFINES += INCLUDE_MODEL_MLCOMMONS_TINY_V01_KWS
+#DEFINES += INCLUDE_MODEL_MLCOMMONS_TINY_V01_VWW
+
+# Uncomment to include all TFLM examples (pdti8, micro_speech, magic_wand)
+#DEFINES += INCLUDE_ALL_TFLM_EXAMPLES
+
+
+# Defaults to Symbiflow and Arty
+TARGET=digilent_arty
+USE_SYMBIFLOW=1
+
+include ../proj.mk
diff --git a/proj/fccm_tutorial/README.md b/proj/fccm_tutorial/README.md
@@ -0,0 +1,73 @@
+# FCCM Tutorial Example
+
+This example accelerator was created for the FCCM 2022 tutorial 
+"CFU-Playground: Build Your Own Custom TinyML Processor".
+
+https://www.fccm.org/workshop-tutorial-2022/
+
+## Amaranth CFU
+
+`cfu.py` contains a complete CFU written in Amaranth. It can perform
+these functions:
+
+* Operation 0: Reset accumulator
+* Operation 1: 4-way multiply accumulate.
+* Operation 2: Read accumulator
+
+Test cases can be run by executing `cfu.py`:
+
+```
+$ ../../scripts/pyrun cfu.py
+```
+
+## Building and Running
+
+To build and program a Digilent Arty board, first follow the standard [setup
+instructions](https://cfu-playground.readthedocs.io/en/latest/setup-guide.html)
+to install Symbiflow and a RISCV compiler. Then:
+
+```
+$ make TARGET=digilent_arty USE_SYMBIFLOW=1 prog
+```
+
+You should see the familiar flashing lights, then:
+
+```
+$ make TARGET=digilent_arty USE_SYMBIFLOW=1 BUILD_JOBS=8 load
+```
+
+This will load the software and start a terminal. Interesting options are:
+
+* 1 (Models), 1 (person detection int 8), 1 (person)   [[check this]]
+* 3 (Project menu), 1 (Exercise CFU)
+
+To ignore the CFU when running models, comment out these lines in the `Makefile`:
+
+```
+# DEFINES += CONV_ACCELERATE
+# DEFINES += DEPTHWISE_ACCELERATE
+```
+
+With gateware ignored, the inference times are very close to the inference
+times as measured with `proj/proj_template`.
+
+To use the CFU operation emulator defined in the `src/software_cfu.cc` file,
+uncomment this line:
+
+```
+# DEFINES += CFU_SOFTWARE_DEFINED
+```
+
+While it is much slower, it is often convenient to use emulated operations
+while debugging.
+
+
+## `proj_menu.cc`
+
+Contains snippets demonstrating the integration of the CFU with the SoC.
+
+[[Insert instructions here]]
+
+## Tensorflow Lite for Microcontrollers
+
+
+
diff --git a/proj/fccm_tutorial/cfu.py b/proj/fccm_tutorial/cfu.py
@@ -0,0 +1,173 @@
+#!/bin/env python
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from amaranth import *
+from amaranth.sim import Delay, Tick
+from amaranth_cfu import TestBase, SimpleElaboratable, pack_vals, CfuBase, CfuTestBase
+import unittest
+
+class MultiplyAccumulate4(SimpleElaboratable):
+    """Performs four, 8 bit wide multiply-accumulates in parallel.
+
+    Uses `SimpleElaboratable` helper class as a convenience.
+    """
+    def __init__(self):
+        # "a" and "b" inputs - each four, 8 bit signed numbers
+        self.a_word = Signal(32)
+        self.b_word = Signal(32)
+
+        # clear to reset accumulator, enable to perform multiply- accumulate
+        self.clear = Signal()
+        self.enable = Signal()
+
+        # result
+        self.accumulator = Signal(signed(32))
+
+    def elab(self, m):
+        """The actual gateware produced"""
+
+        # Divide a_word and b_word each into four, 8-bit parts
+        a_bytes = [self.a_word[i:i+8].as_signed() for i in range(0, 32, 8)]
+        b_bytes = [self.b_word[i:i+8].as_signed() for i in range(0, 32, 8)]
+
+        # Calculate the sum of (a+offset)*b for each part
+        calculations = [(a + Const(128)) * b for a, b in zip(a_bytes, b_bytes)]
+        summed = Signal(signed(32))
+        m.d.comb += summed.eq(sum(calculations))
+
+        with m.If(self.clear):
+            m.d.sync += self.accumulator.eq(0)
+        with m.Elif(self.enable):
+            m.d.sync += self.accumulator.eq(self.accumulator + summed)
+
+
+class MultiplyAccumulate4Test(TestBase):
+    def create_dut(self):
+        return MultiplyAccumulate4()
+
+    def test(self):
+
+        def a(a, b, c, d): return pack_vals(a, b, c, d, offset=-128)
+        def b(a, b, c, d): return pack_vals(a, b, c, d, offset=0)
+        DATA = [
+            # (a_word, b_word, enable, clear), expected accumulator
+            ((a(0, 0, 0, 0),  b(0, 0, 0, 0), 0, 0), 0),
+
+            # Simple tests: with just first byte
+            ((a(10, 0, 0, 0), b(3, 0, 0, 0),  1, 0),   0),
+            ((a(11, 0, 0, 0), b(-4, 0, 0, 0), 1, 0),  30),
+            ((a(11, 0, 0, 0), b(-4, 0, 0, 0), 0, 0), -14),
+            # Since was not enabled last cycle, accumulator will not change
+            ((a(11, 0, 0, 0), b(-4, 0, 0, 0), 1, 0), -14),
+            # Since was enabled last cycle, will change accumlator
+            ((a(11, 0, 0, 0), b(-4, 0, 0, 0), 0, 1), -58),
+            # Accumulator cleared
+            ((a(11, 0, 0, 0), b(-4, 0, 0, 0), 0, 0),  0),
+
+            # Uses all bytes (calculated on a spreadsheet)
+            ((a(99, 22, 2, 1),      b(-2, 6, 7, 111), 1, 0),             0),
+            ((a(2, 45, 79, 22),     b(-33, 6, -97, -22), 1, 0),         59),
+            ((a(23, 34, 45, 56),    b(-128, -121, 119, 117), 1, 0),  -7884),
+            ((a(188, 34, 236, 246), b(-87, 56, 52, -117), 1, 0),     -3035),
+            ((a(131, 92, 21, 83),   b(-114, -72, -31, -44), 1, 0),  -33997),
+            ((a(74, 68, 170, 39),   b(102, 12, 53, -128), 1, 0),    -59858),
+            ((a(16, 63, 1, 198),    b(29, 36, 106, 62), 1, 0),      -47476),
+            ((a(0, 0, 0, 0),        b(0, 0, 0, 0), 0, 1),           -32362),        
+        ]
+
+        dut = self.dut
+
+        def process():
+            for (a_word, b_word, enable, clear), expected in DATA:
+                yield dut.a_word.eq(a_word)
+                yield dut.b_word.eq(b_word)
+                yield dut.enable.eq(enable)
+                yield dut.clear.eq(clear)
+                yield Delay(0.1) # Wait for input values to settle
+
+                # Check on accumulator, as calcuated last cycle
+                self.assertEqual(expected, (yield dut.accumulator))
+                yield Tick()
+
+        self.run_sim(process, write_trace=False)
+
+
+class Cfu(CfuBase):
+    """Simple CFU that provides access to a MultiplyAccumulate4.
+
+    The supported operations are:
+        * Operation 0: Reset accumulator
+        * Operation 1: 4-way multiply accumulate.
+        * Operation 2: Read accumulator
+
+    The implementation here assumes the CPU is always ready to read a response.
+    """
+
+    def elab(self, m):
+        # Build the submodule
+        m.submodules.macc4 = macc4 = MultiplyAccumulate4()
+
+        # Check operation number
+        funct3 = Signal(3)
+        m.d.comb += funct3.eq(self.cmd_function_id[:3])
+
+        # All commands take 1 cycle. CFU is always read to receive a command
+        m.d.comb += self.cmd_ready.eq(1)
+
+        # There is only one response, and it is always valid
+        m.d.comb += self.rsp_out.eq(macc4.accumulator)
+        m.d.comb += self.rsp_valid.eq(1)
+
+        # Inputs to Macc4 always set to CFU inputs
+        m.d.comb += macc4.a_word.eq(self.cmd_in0)
+        m.d.comb += macc4.b_word.eq(self.cmd_in1)
+
+        # clear on zero, enable on 1
+        m.d.comb += macc4.clear.eq(self.cmd_valid & (funct3 == 0))
+        m.d.comb += macc4.enable.eq(self.cmd_valid & (funct3 == 1))
+
+def make_cfu():
+    return Cfu()
+
+class CfuTest(CfuTestBase):
+    def create_dut(self):
+        return make_cfu()
+
+    def test(self):
+        "Tests CFU plumbs to Madd4 correctly"
+        def a(a, b, c, d): return pack_vals(a, b, c, d, offset=-128)
+        def b(a, b, c, d): return pack_vals(a, b, c, d, offset=0)
+        # These values were calculated with a spreadsheet
+        DATA = [
+            # ((fn3, op1, op2), result)
+            ((0, 0, 0), None),  #reset
+            ((1, a(130, 7, 76, 47), b(104, -14, -24, 71)), None), # calculate
+            ((1, a(84, 90, 36, 191), b(109, 57, -50, -1)), None),
+            ((1, a(203, 246, 89, 178), b(-87, 26, 77, 71)), None),
+            ((1, a(43, 27, 78, 167), b(-24, -8, 65, 124)), None),
+            ((2, 0, 0), 59986), # read result
+
+            ((0, 0, 0), None),  #reset
+            ((1, a(67, 81, 184, 130), b(81, 38, -116, 65)), None),
+            ((1, a(208, 175, 180, 198), b(-120, -70, 8, 11)), None),
+            ((1, a(185, 81, 101, 108), b(90, 6, -92, 83)), None),
+            ((1, a(219, 216, 114, 236), b(-116, -9, -109, -16)), None),
+            ((2, 0, 0), -64723), # read result
+        ]
+        self.run_ops(DATA)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/proj/fccm_tutorial/cfu_gen.py b/proj/fccm_tutorial/cfu_gen.py
@@ -0,0 +1,38 @@
+# Copyright 2021 The CFU-Playground Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os.path
+from amaranth import *
+from amaranth.back import rtlil, verilog
+
+from cfu import make_cfu
+
+VERILOG_FILENAME = "cfu.v"
+
+def read_file():
+    if os.path.exists(VERILOG_FILENAME):
+        with open(VERILOG_FILENAME, "r") as f:
+            return f.read()
+    return None
+
+def main():
+    cfu = make_cfu()
+    new_verilog = verilog.convert(cfu, name='Cfu', ports=cfu.ports)
+    old_verilog = read_file()
+    if new_verilog != old_verilog:
+        with open(VERILOG_FILENAME, "w") as f:
+            f.write(new_verilog)
+
+if __name__ == '__main__':
+    main()
diff --git a/proj/fccm_tutorial/ci/ci_build_params.txt b/proj/fccm_tutorial/ci/ci_build_params.txt
diff --git a/proj/fccm_tutorial/ci/ci_exclude_targets.txt b/proj/fccm_tutorial/ci/ci_exclude_targets.txt
@@ -0,0 +1,23 @@
+1bitsquared_icebreaker_bitsy
+colorlight_5a_75x
+decklink_intensity_pro_4k
+digilent_basys3
+digilent_cmod_a7
+ego1
+lattice_crosslink_nx_evn
+lattice_crosslink_nx_vip
+lattice_ecp5_evn
+lattice_ice40up5k_evn
+micronova_mercury2
+muselab_icesugar
+pano_logic_g2
+redpitaya
+simple
+sqrl_fk33
+terasic_deca
+terasic_sockit
+tinyfpga_bx
+trenz_te0725
+xilinx_zybo_z7
+qmtech_xc7a35t
+hps