Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
5e2d0dc
Add minimal bug example.
andreaskuster Jan 8, 2021
f4763fc
Further reduce minimal example.
andreaskuster Jan 8, 2021
89f87c3
Account for offset to center.
andreaskuster Jan 8, 2021
e1caeb1
Add fpga0 sdk env vars script
andreaskuster Jan 9, 2021
34389ae
Add larger jacobi3d example
andreaskuster Jan 9, 2021
b1bac07
Add temporary fix.
andreaskuster Jan 9, 2021
d300e59
Merge remote-tracking branch 'origin/fix_cross_dependent_path' into f…
andreaskuster Jan 9, 2021
e83e4e3
Increase problem size.
andreaskuster Jan 9, 2021
9da97a3
Add more complex example.
andreaskuster Jan 10, 2021
74f8d21
Merge remote-tracking branch 'origin/fix_cross_dependent_path' into f…
andreaskuster Jan 10, 2021
4000eca
Add minimal bug example.
andreaskuster Jan 8, 2021
f0e2e3b
Further reduce minimal example.
andreaskuster Jan 8, 2021
8023480
Account for offset to center.
andreaskuster Jan 8, 2021
0afabe0
Add temporary fix.
andreaskuster Jan 9, 2021
c4b83c5
Add fpga0 sdk env vars script
andreaskuster Jan 9, 2021
ab8c555
Add larger jacobi3d example
andreaskuster Jan 9, 2021
47cc666
Add more complex example.
andreaskuster Jan 10, 2021
ed1dcb8
Increase problem size.
andreaskuster Jan 9, 2021
97fe0fd
Merge remote-tracking branch 'origin/fix_cross_dependent_path' into f…
andreaskuster Sep 11, 2021
ff683e1
Make example more distinct to test function correctness.
andreaskuster Sep 11, 2021
0e35d03
Add path inclusion for direct file execution.
andreaskuster Sep 11, 2021
fb9966c
Several readme extension
andreaskuster Sep 11, 2021
67a6b93
Remove horidiff hotfix
andreaskuster Sep 11, 2021
47dfc58
Update README.md
andreaskuster Sep 13, 2021
6b737df
Update README.md
andreaskuster Sep 13, 2021
6a27a5f
Move test config to default location
andreaskuster Sep 13, 2021
b9ae43b
Remove local env setup
andreaskuster Sep 13, 2021
2a73675
Merge remote-tracking branch 'origin/fix_cross_dependent_path' into f…
andreaskuster Sep 13, 2021
86ab704
Add extended horidiff example.
andreaskuster Sep 14, 2021
d442f68
Reduce min channel depth to 1024
andreaskuster Sep 15, 2021
6a0cf1b
Update dace version
andreaskuster Sep 15, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ To run the code, the following software must be available:
- Python 3.6.x or newer.
- The `virtualenv` module (installed with `pip install virtualenv`).
- A C++17-capable compiler (e.g., GCC 7.x or Clang 6.x).
- Graphviz (for graph plotting support).
- One or both FPGA compilers:
- Intel FPGA OpenCL SDK (tested with 18.1.1 and 19.1)
- Xilinx Vitis (tested with 2020.2)
Expand Down Expand Up @@ -47,6 +48,13 @@ kernel source files themselves in:
.dacecache/<kernel name>/src/intel_fpga/device
```

To run a low-level analysis of the buffer sizes and to visualize a stencil program, you can invoke the script `stencilflow/kernel_chain_graph.py` directly.
Example usage:

```bash
stencilflow/kernel_chain_graph.py -stencil_file test/stencils/jacobi3d_32x32x32_8itr_8vec.json -plot -simulate -report -opt
```

Verification
------------

Expand Down Expand Up @@ -81,3 +89,16 @@ It is a known issue that launching multiple Intel FPGA kernels in quick
succession (such as is done in the tests) can sometimes fail sporadically,
seemingly due to file I/O issues. Running individual programs should never fail.

Publication
-----------

If you use StencilFlow, cite us:
```bibtex
@inproceedings{dace,
author = {Johannes de~Fine~Licht and Andreas Kuster and Tiziano De~Matteis and Tal Ben-Nun and Dominic Hofer and Torsten Hoefler},
title = {StencilFlow: Mapping Large Stencil Programs to Distributed Spatial Computing Systems},
year = {2021},
booktitle = {Proceedings of the IEEE/ACM International Symposium on Code Generation and Optimization (CGO'21)},
series = {CGO '21}
}
```
2 changes: 1 addition & 1 deletion dace
94 changes: 63 additions & 31 deletions stencilflow/kernel_chain_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@
import operator
import re
import os
import sys

sys.path.append(os.path.dirname(os.path.dirname(__file__)))

from typing import Any, List, Dict, Tuple

Expand Down Expand Up @@ -289,14 +292,14 @@ def add_channels(self) -> None:
name = src.name + "_" + dest.name
channel = {
"name":
name,
name,
"delay_buffer":
self.kernel_nodes[dest.name].delay_buffer[
src.name],
self.kernel_nodes[dest.name].delay_buffer[
src.name],
"internal_buffer":
dest.internal_buffer[src.name],
dest.internal_buffer[src.name],
"data_type":
src.data_type
src.data_type
}
# add channel reference to global channel dictionary
self.channels[name] = channel
Expand All @@ -314,18 +317,18 @@ def add_channels(self) -> None:
name = src.name + "_" + dest.name
channel = {
"name":
name,
name,
"delay_buffer":
self.kernel_nodes[dest.name].delay_buffer[
src.name],
self.kernel_nodes[dest.name].delay_buffer[
src.name],
"internal_buffer":
dest.internal_buffer[src.name],
dest.internal_buffer[src.name],
"data_type":
src.data_type,
src.data_type,
"input_dims":
self.inputs[src.name]["input_dims"]
if "input_dims" in self.inputs[src.name]
else None
self.inputs[src.name]["input_dims"]
if "input_dims" in self.inputs[src.name]
else None
}
# add channel reference to global channel dictionary
self.channels[name] = channel
Expand All @@ -342,13 +345,13 @@ def add_channels(self) -> None:
name = src.name + "_" + dest.name
channel = {
"name":
name,
name,
"delay_buffer":
self.output_nodes[dest.name].delay_buffer[
src.name],
self.output_nodes[dest.name].delay_buffer[
src.name],
"internal_buffer": {},
"data_type":
src.data_type
src.data_type
}
# add channel reference to global channel dictionary
self.channels[name] = channel
Expand Down Expand Up @@ -386,16 +389,16 @@ def import_input(self) -> None:
else:
i["input_dims"] = stencilflow.ITERATORS[len(stencilflow.
ITERATORS) -
self.kernel_dimensions:]
self.kernel_dimensions:]
self.outputs = inp["outputs"]
# handle stencil program output dimensions
if self.kernel_dimensions == 1: # 1D
for entry in self.program:
self.program[entry]["computation_string"] = \
self.program[entry]["computation_string"].replace("[", "[i, j,") # add two extra indices
self.dimensions = [
1, 1
] + inp["dimensions"] # add two extra dimensions
1, 1
] + inp["dimensions"] # add two extra dimensions
elif self.kernel_dimensions == 2: # 2D
for entry in self.program:
self.program[entry]["computation_string"] = self.program[entry]["computation_string"] \
Expand Down Expand Up @@ -489,28 +492,37 @@ def compute_delay_buffer(self) -> None:
order = list(nx.topological_sort(self.graph))
except nx.exception.NetworkXUnfeasible:
cycle = next(nx.algorithms.cycles.simple_cycles(self.graph))
raise ValueError("Cycle detected: {}".format(
[c.name for c in cycle]))
raise ValueError("Cycle detected: {}".format([c.name for c in cycle]))
# go through all nodes
for node in order:
# process delay buffer (no additional delay buffer will appear because of the topological order)
for inp in node.input_paths:

# add internal buffer latency for internal computation
if not isinstance(node, Output):
for entry in node.input_paths[inp]:
name = entry[-1]
entry[2] += node.dist_to_center[name]

# compute maximum delay size per input
max_delay = max(node.input_paths[inp])
max_delay[
2] += 1 # add an extra delay cycle for the processing in the kernel node
max_delay[2] += 1 # add an extra delay cycle for the processing in the kernel node
# loop over all inputs and set their size relative to the max size to have data ready at the exact
# same time
for entry in node.input_paths[inp]:
name = entry[-1]
max_size = stencilflow.convert_3d_to_1d(
dimensions=self.dimensions,
index=stencilflow.list_subtract_cwise(
max_delay[:-1], entry[:-1]))
node.delay_buffer[name] = BoundedQueue(name=name,
maxsize=max_size)
node.delay_buffer[name].import_data(
[None] * node.delay_buffer[name].maxsize)
index=stencilflow.list_subtract_cwise(max_delay[:-1], entry[:-1]))
node.delay_buffer[name] = BoundedQueue(name=name, maxsize=max_size)
node.delay_buffer[name].import_data([None] * node.delay_buffer[name].maxsize)

# remove internal buffer latency for internal computation
if not isinstance(node, Output):
for entry in node.input_paths[inp]:
name = entry[-1]
entry[2] -= node.dist_to_center[name]

# set input node delay buffers to 1
if isinstance(node, Input):
node.delay_buffer = BoundedQueue(name=node.name,
Expand Down Expand Up @@ -716,7 +728,7 @@ def report(self, name):
u.name, v.name, entry.name, entry.maxsize))
total_fast += entry.maxsize
print("buffer size slow memory: {} \nbuffer size fast memory: {}".format(
total_slow, total_fast))
total_slow, total_fast))

def operation_count(self):
"""For each operation type found in the ASTs, return a tuple of
Expand Down Expand Up @@ -789,6 +801,14 @@ def runtime_lower_bound(self):
type=int)
parser.add_argument("-report", action="store_true")
parser.add_argument("-simulate", action="store_true")
parser.add_argument("-opt", action="store_true")
parser.add_argument("-opt_goal", default=["min_fast_mem", 12000], nargs="+")
"""
choices:
- min_com_vol, FAST_MEM_BOUND, SLOW_MEM_BOUND
- min_fast_mem, COM_VOL_BOUND
- opt_ratio, RATIO
"""
args = parser.parse_args()
args.log_level = stencilflow.log_level.LogLevel(args.log_level)
program_description = stencilflow.parse_json(args.stencil_file)
Expand All @@ -809,6 +829,18 @@ def runtime_lower_bound(self):
log_level=LogLevel(args.log_level))
sim.simulate()

# choose optimization goal
if args.opt:
from stencilflow import Optimizer

opt = Optimizer(self.kernel_nodes, self.dimensions)
if args.opt_goal[0] == "min_com_vol":
opt.minimize_comm_vol(fast_memory_bound=args.opt_goal[1], slow_memory_bound=args.opt_goal[2])
if args.opt_goal[0] == "min_fast_mem":
opt.minimize_fast_mem(communication_volume_bound=args.opt_goal[1])
if args.opt_goal[0] == "opt_ratio":
opt.optimize_to_ratio(ratio=args.opt_goal[1])

# output a report if argument -report is true
if args.report:
chain.report(args.stencil_file)
Expand Down
2 changes: 1 addition & 1 deletion stencilflow/sdfg_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

import networkx as nx

MINIMUM_CHANNEL_DEPTH = 2048
MINIMUM_CHANNEL_DEPTH = 1024

NUM_BANKS = 4

Expand Down
84 changes: 84 additions & 0 deletions test/stencils/horidiff_min.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
{
"inputs": {
"inA": {
"data": "inA_float32.dat",
"data_type": "float32",
"input_dims": [
"i"
]
}
},
"outputs": [
"out"
],
"dimensions": [
10,
10,
10
],
"vectorization": 1,
"program": {
"k0": {
"data_type": "float32",
"computation_string": "k0 = inA[i]",
"boundary_conditions": {
"inA": {
"type": "constant",
"value": 0.0
}
}
},
"k1": {
"data_type": "float32",
"computation_string": "k1 = inA[i]",
"boundary_conditions": {
"inA": {
"type": "constant",
"value": 0.0
}
}
},
"k2": {
"data_type": "float32",
"computation_string": "k2 = k1[i, j, k] + k0[i+1, j, k] + k0[i, j, k]",
"boundary_conditions": {
"k1": {
"type": "constant",
"value": 0.0
},
"k0": {
"type": "constant",
"value": 0.0
}
}
},
"k3": {
"data_type": "float32",
"computation_string": "k3 = k0[i, j, k] + k1[i+1, j+1, k+1] + k1[i, j, k]",
"boundary_conditions": {
"k0": {
"type": "constant",
"value": 0.0
},
"k1": {
"type": "constant",
"value": 0.0
}
}
},
"out": {
"data_type": "float32",
"computation_string": "out = k2[i, j, k] + k3[i, j, k]",
"boundary_conditions": {
"k2":{
"type": "constant",
"value": 0.0
},
"k3": {
"type": "constant",
"value": 0.0
}
}
}
}
}
Loading