diff --git a/Makefile b/Makefile index 26a0a89..c312a10 100644 --- a/Makefile +++ b/Makefile @@ -41,8 +41,9 @@ SOURCES = cpu.cc cpzero.cc devicemap.cc \ routerinterface.cc routerinterface.h router.cc router.h \ accelerator.h accelerator.cc \ remoteram.h remoteram.cc cma.h cma.cc cmamodules.cc cmamodules.h \ - cmaAddressMap.h dbuf.h dbuf.cc snacc.cc snacc.h snacccore.c snacccore.h \ - snaccAddressMap.h snaccmodules.c snaccmodules.h + cmaAddressMap.h dbuf.h dbuf.cc snacc.cc snacc.h snacccore.cc snacccore.h \ + snaccAddressMap.h snaccmodules.cc snaccmodules.h \ + debugutils.cc debugutils.h OBJECTS = cpu.$(OBJEXT) cpzero.$(OBJEXT) devicemap.$(OBJEXT) \ mapper.$(OBJEXT) options.$(OBJEXT) range.$(OBJEXT) \ @@ -58,7 +59,8 @@ OBJECTS = cpu.$(OBJEXT) cpzero.$(OBJEXT) devicemap.$(OBJEXT) \ routerinterface.${OBJEXT} router.${OBJEXT} \ remoteram.${OBJEXT} accelerator.${OBJEXT} \ cma.${OBJEXT} cmamodules.${OBJEXT} dbuf.${OBJEXT} \ - snacc.${OBJEXT} snacccore.${OBJEXT} snaccmodules.${OBJEXT} + snacc.${OBJEXT} snacccore.${OBJEXT} snaccmodules.${OBJEXT} \ + debugutils.${OBJEXT} LDADD = libopcodes_mips/libopcodes_mips.a @@ -144,7 +146,7 @@ vmips.o: vmips.cc clock.h task.h types.h config.h \ testdev.h stub-dis.h libopcodes_mips/bfd.h libopcodes_mips/ansidecl.h \ libopcodes_mips/symcat.h libopcodes_mips/dis-asm.h rommodule.h \ interactor.h rs232c.h routerinterface.h remoteram.h accelerator.h \ - cma.h snacc.h dmac.h + cma.h snacc.h dmac.h debugutils.h deviceint.o: deviceint.cc deviceint.h intctrl.h types.h config.h \ vmips.h @@ -152,7 +154,7 @@ deviceint.o: deviceint.cc deviceint.h intctrl.h types.h config.h \ debug.o: debug.cc debug.h deviceexc.h accesstypes.h types.h config.h \ remotegdb.h cpu.h \ vmips.h mapper.h range.h \ - excnames.h cpzeroreg.h options.h + excnames.h cpzeroreg.h options.h debugutils.h remotegdb.o: remotegdb.cc remotegdb.h types.h config.h @@ -237,22 +239,26 @@ routerinterface.o: routerinterface.cc routerinterface.h\ router.o: router.cc router.h vmips.h 
options.h accelerator.o: accelerator.h accelerator.cc \ - range.h router.h error.h options.h vmips.h + range.h router.h error.h options.h vmips.h debugutils.h -remoteram.o: remoteram.cc remoteram.h accelerator.h memorymodule.h +remoteram.o: remoteram.cc remoteram.h accelerator.h \ + memorymodule.h debugutils.h dbuf.o: dbuf.h dbuf.cc range.h types.h fileutils.h vmips.h options.h cma.o: cma.cc cma.h accelerator.h dbuf.h accesstypes.h\ - types.h cmamodules.h cmaAddressMap.h + types.h cmamodules.h cmaAddressMap.h debugutils.h cmamodules.o: cmamodules.h cmamodules.cc cmaAddressMap.h range.h \ dbuf.h accelerator.h -snacc.o: snacc.h snacc.cc dbuf.h snaccAddressMap.h snaccmodules.h +snacc.o: snacc.h snacc.cc dbuf.h snaccAddressMap.h snaccmodules.h \ + debugutils.h snacccore.o: snacccore.h snacccore.cc vmips.h options.h \ snaccmodules.h snaccAddressMap.h snaccmodules.o: snaccmodules.h snaccmodules.cc snaccAddressMap.h \ - accesstypes.h \ No newline at end of file + accesstypes.h + +debugutils.o: debugutils.cc debugutils.h devicemap.h vmips.h mapper.h diff --git a/README.md b/README.md index 61328dd..df12e76 100644 --- a/README.md +++ b/README.md @@ -67,8 +67,9 @@ VMIPSから備わっているオプションに関しては[VMIPSのドキュメ * dcachebnum: データキャッシュのブロック数 (数値) #### メモリアクセス関連 * mem_bandwidth: メモリバンド幅 (ワード数を指定する) (数値) -* bus_latency: 内部バスにおける遅延サイクル数 (数値) +* bus_latency: バスアクセス権獲得後にメモリモジュールに要求が到達するまでのサイクル数 (数値) * exmem_latency: 外部メモリにおける遅延サイクル数 (数値) + #### ルータ関連 * vcbufsize: virtual channelごとのバッファサイズ (数値) * routermsg: ルータにおけるメッセージ表示有効化 (flag) @@ -77,7 +78,7 @@ VMIPSから備わっているオプションに関しては[VMIPSのドキュメ * accelerator1: 1番目のアクセラレータ (文字列) * accelerator2: 2番目のアクセラレータ (文字列) -#### SNACC向けオプション +#### SNACC用オプション * snacc_sram_latency: オンチップSRAMのレイテンシ (数値) * snacc_inst_dump: コアで実行した命令をダンプする (文字列: 形式 "(チップID,コアID)") * snacc_mad_debug: コアのmadユニットでの計算を表示する (文字列: 形式 "(チップID,コアID)") @@ -89,3 +90,4 @@ VMIPSから備わっているオプションに関しては[VMIPSのドキュメ * cacheprof: キャッシュアクセス数、ミス率など (bool) * routerprof: 転送フリット数など (bool) * exmemprof: 外部メモリへのアクセス数 
(bool) + diff --git a/accelerator.h b/accelerator.h index 92cf1c0..359976c 100644 --- a/accelerator.h +++ b/accelerator.h @@ -4,6 +4,7 @@ #include "router.h" #include "types.h" #include "range.h" +#include "debugutils.h" #define DONE_NOTIF_ADDR 0x00000 #define DMAC_NOTIF_ADDR 0x00001 @@ -107,7 +108,7 @@ class NetworkInterfaceConfig : public Range { }; //abstract class for accelerator top module -class CubeAccelerator { +class CubeAccelerator : public DebuggerClient { private: //data to/from router bool iready[VCH_SIZE]; diff --git a/cma.cc b/cma.cc index 9a58153..eb95ec2 100644 --- a/cma.cc +++ b/cma.cc @@ -1,4 +1,5 @@ #include "cma.h" +#include "debugutils.h" using namespace CMAComponents; @@ -17,7 +18,7 @@ CMA::CMA(uint32 node_ID, Router* upperRouter) imem = new DoubleBuffer(CMA_IMEM_SIZE, CMA_IWORD_MASK); // const regs - const_reg = new ConstRegCtrl(CMA_CONST_SIZE, pearray); + const_reg = new ConstRegCtrl(CMA_CONST_SIZE * 2, pearray); // Data manipulator ld_unit = new LDUnit(CMA_PE_ARRAY_WIDTH, &dbank, pearray); @@ -37,6 +38,10 @@ CMA::CMA(uint32 node_ID, Router* upperRouter) // Microcontroller mc = new MicroController(imem, ld_unit, st_unit, &mc_done); + //for debugger + debug_op = DBG_CMD_NOP; + resp_data = 0; + } CMA::~CMA() @@ -117,4 +122,80 @@ void CMA::core_step() mc_working = false; mc->reset(); } -} \ No newline at end of file +} + +void CMA::send_commnad(uint32 cmd, uint32 arg) { + uint8 func, mod, offset; + __cmd_parser(cmd, debug_op, func, mod, offset); + switch (debug_op) { + case DBG_CMD_SETTRG_OP: + trgr_arg = arg; + trgr_cnd = func; + trgr_mod = mod; + trgr_offset = offset; + break; + case DBG_CMD_WRITE_OP: + debug_store(mod, offset, arg); + break; + case DBG_CMD_READ_OP: + resp_data = debug_fetch(mod, offset); + break; + default: + debug_op = DBG_CMD_NOP; + } +} + +bool CMA::isTriggered() +{ + uint32 mod_val = debug_fetch(trgr_mod, trgr_offset); + fprintf(stderr, "mod(%d, %d) %X arg %X\n", trgr_mod, trgr_offset, mod_val, trgr_arg); + return 
__compare(mod_val, trgr_arg, trgr_cnd); +} + +uint32 CMA::get_dbg_data() +{ + return resp_data; +} + +uint32 CMA::debug_fetch(uint8 mod, uint8 offset) +{ + switch (mod) { + case CMA_DEBUG_MOD_PC: + return mc->debug_fetch_pc(); + case CMA_DEBUG_MOD_RF: + return mc->debug_fetch_regfile(offset); + case CMA_DEBUG_MOD_LR: + return pearray->debug_fetch_launch(offset); + case CMA_DEBUG_MOD_GR: + return pearray->debug_fetch_gather(offset); + case CMA_DEBUG_MOD_ALU_L: + case CMA_DEBUG_MOD_ALU_R: + case CMA_DEBUG_MOD_ALU_O: + return pearray->debug_fetch_ALU(offset, mod); + default: + return 0; + } +} + +void CMA::debug_store(uint8 mod, uint8 offset, uint32 data) +{ + switch (mod) { + case CMA_DEBUG_MOD_PC: + mc->debug_store_pc(data); + break; + case CMA_DEBUG_MOD_RF: + mc->debug_store_regfile(offset, data); + break; + case CMA_DEBUG_MOD_LR: + pearray->debug_store_launch(offset, data); + break; + case CMA_DEBUG_MOD_GR: + pearray->debug_store_gather(offset, data); + break; + case CMA_DEBUG_MOD_ALU_L: + case CMA_DEBUG_MOD_ALU_R: + case CMA_DEBUG_MOD_ALU_O: + return pearray->debug_store_ALU(offset, mod, data); + break; + } +} diff --git a/cma.h b/cma.h index c0850f5..b101298 100644 --- a/cma.h +++ b/cma.h @@ -8,6 +8,7 @@ #include "cmaAddressMap.h" #include "dbuf.h" + class DeviceExc; class CubeAccelerator; class MemoryModule; @@ -34,6 +35,14 @@ class CMA : public CubeAccelerator { bool mc_working; bool done_notif; + //for debug + uint8 trgr_cnd, trgr_mod, trgr_offset; + uint8 debug_op; + uint32 trgr_arg; + uint32 resp_data; + uint32 debug_fetch(uint8 mod, uint8 offset); + void debug_store(uint8 mod, uint8 offset, uint32 data); + public: CMA(uint32 node_ID, Router* upperRouter); ~CMA(); @@ -41,9 +50,15 @@ class CMA : public CubeAccelerator { void setup(); void core_step(); void core_reset(); + const char *accelerator_name() { return "CMA"; } CMAComponents::ControlReg *ctrl_reg; + + //for debuger + virtual void send_commnad(uint32 cmd, uint32 arg); + virtual bool isTriggered(); + 
virtual uint32 get_dbg_data(); }; diff --git a/cmamodules.cc b/cmamodules.cc index f9ecd0e..a80d228 100644 --- a/cmamodules.cc +++ b/cmamodules.cc @@ -257,7 +257,7 @@ void PEArray::make_memports() uint32 PEArray::load_const(uint32 addr) { uint32 index = addr >> 2; - if (index > height * 2) { + if (index >= height * 2) { return 0; } else { return cregs[index]->getData(); @@ -447,6 +447,7 @@ void PEArray::exec() if (config_changed) { analyze_dataflow(); } + for (auto it = tsorted_list.begin(); it != tsorted_list.end(); it++) { PENodeBase *p = *it; p->exec(); @@ -455,10 +456,12 @@ void PEArray::exec() for (int i = 0; i < width; i++) { gather_regs[i]->exec(); } + for (auto it = tsorted_list.begin(); it != tsorted_list.end(); it++) { PENodeBase *p = *it; p->update(); } + } void PEArray::launch(ArrayData input_data) @@ -477,6 +480,70 @@ ArrayData PEArray::gather() return output_data; } +uint32 PEArray::debug_fetch_launch(uint32 col) +{ + return (col < width) ? launch_regs[col]->getData() : 0; +} + +void PEArray::debug_store_launch(uint32 col, uint32 data) +{ + if (col < width) { + launch_regs[col]->debug_push_data(data & CMA_DWORD_MASK); + } +} + +uint32 PEArray::debug_fetch_gather(uint32 col) +{ + return (col < width) ? 
gather_regs[col]->getData() : 0; +} + +void PEArray::debug_store_gather(uint32 col, uint32 data) +{ + if (col < width) { + gather_regs[col]->debug_push_data(data & CMA_DWORD_MASK); + } +} + +uint32 PEArray::debug_fetch_ALU(uint8 pe_addr, uint8 type) +{ + int x = pe_addr % width; + int y = pe_addr / width; + + if (y >= height) return 0; + + switch (type) { + case CMA_DEBUG_MOD_ALU_L: + return alu_sels[x][y][0]->getData(); + case CMA_DEBUG_MOD_ALU_R: + return alu_sels[x][y][1]->getData(); + case CMA_DEBUG_MOD_ALU_O: + return alus[x][y]->getData(); + default: + return 0; + } +} + +void PEArray::debug_store_ALU(uint8 pe_addr, uint8 type, uint32 data) +{ + int x = pe_addr % width; + int y = pe_addr / width; + + if (y >= height) return; + + switch (type) { + case CMA_DEBUG_MOD_ALU_L: + alu_sels[x][y][0]->debug_push_data(data & CMA_DWORD_MASK); + break; + case CMA_DEBUG_MOD_ALU_R: + alu_sels[x][y][1]->debug_push_data(data & CMA_DWORD_MASK); + break; + case CMA_DEBUG_MOD_ALU_O: + alus[x][y]->debug_push_data(data & CMA_DWORD_MASK); + break; + } + return; +} + DataManipulator::DataManipulator(int interleave_size_, DoubleBuffer*** dbank_, PEArray *pearray_) : @@ -692,6 +759,18 @@ void MicroController::step() pc = next_pc; } +uint32 MicroController::debug_fetch_regfile(uint32 sel) +{ + return (sel < CMA_MC_REG_SIZE) ? 
(uint32)regfile[sel] : 0; +} + +void MicroController::debug_store_regfile(uint32 sel, uint32 data) +{ + if (sel < CMA_MC_REG_SIZE) { + regfile[sel] = (uint16)data; + } +} + void CCSOTB2::CCSOTB2_PEArray::make_connection() { @@ -865,6 +944,19 @@ void PENodeBase::connect(PENodeBase* pred) } } +void PENodeBase::debug_push_data(uint32 data) +{ + std::queue tmp; + tmp.push(data); + obuf.pop(); //discard first data + while (!obuf.empty()) { + tmp.push(obuf.front()); + tmp.pop(); + } + std::swap(tmp, obuf); +} + + void MUX::exec() { obuf.push(predecessors[config_data]->getData()); diff --git a/cmamodules.h b/cmamodules.h index a15a18a..c922dfd 100644 --- a/cmamodules.h +++ b/cmamodules.h @@ -13,6 +13,14 @@ /*for debug */ #include +//For debug info +#define CMA_DEBUG_MOD_PC 0 +#define CMA_DEBUG_MOD_RF 1 //reg file +#define CMA_DEBUG_MOD_LR 2 //launch reg +#define CMA_DEBUG_MOD_GR 3 //gather reg +#define CMA_DEBUG_MOD_ALU_L 4 //alu input left +#define CMA_DEBUG_MOD_ALU_R 5 //alu input right +#define CMA_DEBUG_MOD_ALU_O 6 //alu output /* word size */ #define CMA_DWORD_MASK 0x1FFFFFF //25bit @@ -279,6 +287,11 @@ namespace CMAComponents { ld_unit(ld_unit_), st_unit(st_unit_) {}; void step(); void reset(); + + uint32 debug_fetch_pc() {return pc; }; + void debug_store_pc(uint32 data) { pc = data; }; + uint32 debug_fetch_regfile(uint32 sel); + void debug_store_regfile(uint32 sel, uint32 data); }; static std::map debug_str; @@ -303,6 +316,8 @@ namespace CMAComponents { virtual void update() { obuf.pop(); }; virtual NodeList use_successors(); + + void debug_push_data(uint32 data); }; class MUX : public PENodeBase { @@ -345,6 +360,7 @@ namespace CMAComponents { uint32 getData(); void writeData(uint32 data); void exec() {}; //nothing to do + void update() {}; //nothing to do bool isUse(CMAComponents::PENodeBase* pred) { return false; }; }; @@ -425,6 +441,14 @@ namespace CMAComponents { void exec(); void update(); + + //for debugger + uint32 debug_fetch_launch(uint32 col); + void 
debug_store_launch(uint32 col, uint32 data); + uint32 debug_fetch_gather(uint32 col); + void debug_store_gather(uint32 col, uint32 data); + uint32 debug_fetch_ALU(uint8 pe_addr, uint8 type); + void debug_store_ALU(uint8 pe_addr, uint8 type, uint32 data); }; namespace CCSOTB2 { diff --git a/debug.cc b/debug.cc index 1bf719c..08b39ae 100644 --- a/debug.cc +++ b/debug.cc @@ -40,7 +40,7 @@ extern int remotegdb_backend_error; Debug::Debug (CPU &cpu_, Mapper &mem_) : cpu (&cpu_), mem (&mem_), listener (-1), threadno_step (-1), threadno_gen (-1), rom_baseaddr (0), rom_nwords (0), got_interrupt (false), - debug_verbose (false) { + debug_verbose (true) { /* Upon connecting to our socket, gdb will ask for the current * signal; so we set the current signal to the breakpoint signal. */ @@ -116,6 +116,53 @@ Debug::remove_breakpoint(uint32 addr) } } +/* Set a watchpoint given in ADDR. */ +bool +Debug::declare_watchpoint(uint32 addr) +{ + Range* l = mem->find_mapping_range(addr); + if (l == NULL) { + return false; + } + + for (auto i = acdbg_set.begin(); i != acdbg_set.end(); i++) { + if (*i == l) { + triggered_acdbg[l] = true; + return true; + } + } + return false; +} + +/* Unset a watchpoint given in ADDR. */ +bool +Debug::remove_watchpoint(uint32 addr) +{ + Range* l = mem->find_mapping_range(addr); + if (triggered_acdbg[l]) { + triggered_acdbg[l] = false; + return true; + } + return true; +} + +bool Debug::watchpoint_exists() +{ + for (auto i = acdbg_set.begin(); i != acdbg_set.end(); i++) { + if (triggered_acdbg[*i]) { + if ((*i)->isTriggered()) { + return true; + } + } + } + return false; +} + +void Debug::register_ac_debbuger(AcceleratorDebugger *dbg) +{ + acdbg_set.insert(dbg); +} + /* True if ADDR is a virtual address within a known ROM block. This is pretty * lame right now; we should really ask the Mapper. */ @@ -569,6 +616,9 @@ Debug::single_step(void) if (breakpoint_exists(cpu->debug_get_pc())) { return Bp; /* Simulate hitting the breakpoint. 
*/ } + if (watchpoint_exists()) { + return Bp; + } if (got_interrupt == true) { return Bp; /* interrupt. */ } @@ -678,6 +728,16 @@ Debug::target_set_or_remove_breakpoint(char *pkt, bool setting) case 2: /* write watchpoint */ case 3: /* read watchpoint */ case 4: /* access watchpoint */ + // it works only for accelerator debugger + if (setting) { + if (declare_watchpoint(addr)) { + return rawpacket("OK"); + } + } else { + if (remove_watchpoint(addr)) { + return rawpacket("OK"); + } + } default: return rawpacket(""); /* Not supported. */ } diff --git a/debug.h b/debug.h index 1b4cfb5..379c916 100644 --- a/debug.h +++ b/debug.h @@ -21,7 +21,10 @@ with VMIPS; if not, write to the Free Software Foundation, Inc., #define _DEBUG_H_ #include "deviceexc.h" +#include "debugutils.h" #include +#include + class CPU; class Mapper; @@ -37,6 +40,9 @@ class Debug : public DeviceExc { uint32 rom_nwords; typedef std::set wordset; wordset bp_set; + std::set acdbg_set; + std::map triggered_acdbg; + bool opt_bigendian; bool debug_verbose; @@ -54,6 +60,8 @@ class Debug : public DeviceExc { void reset(); void step(); + + void register_ac_debbuger(AcceleratorDebugger *dbg); private: int setup_listener_socket(void); int set_nonblocking(int fd); @@ -82,6 +90,9 @@ class Debug : public DeviceExc { bool breakpoint_exists(uint32 addr); void declare_breakpoint(uint32 addr); void remove_breakpoint(uint32 addr); + bool declare_watchpoint(uint32 addr); + bool remove_watchpoint(uint32 addr); + bool watchpoint_exists(); bool address_in_rom(uint32 addr); void get_breakpoint_bitmap_entry(uint32 addr, uint8 *&entry, uint8 &bitno); bool is_breakpoint_insn(char *packetptr); diff --git a/debugutils.cc b/debugutils.cc new file mode 100644 index 0000000..cb975d8 --- /dev/null +++ b/debugutils.cc @@ -0,0 +1,139 @@ +#include "debugutils.h" +#include "vmips.h" +#include "mapper.h" +#include "cpu.h" +#include + +AcceleratorDebugger::AcceleratorDebugger(DebuggerClient* acdebugger_) : + acdebugger(acdebugger_), 
DeviceMap (ACDBGR_SIZE) +{ + cmd = 0; + arg = 0; + prev_fetch_addr = 0x3; +} + +uint32 AcceleratorDebugger::fetch_word(uint32 offset, int mode, DeviceExc *client) +{ + + switch (offset) { + case RPLY_OFFSET: + return acdebugger->get_dbg_data(); + case CMD_OFFSET: + return cmd; + case ARG_OFFSET: + return arg; + case SEND_OFFSET: + return 0; + default: + return 0; + } +} + +uint8 AcceleratorDebugger::fetch_byte(uint32 offset, DeviceExc *client) +{ + if (prev_fetch_addr != (offset & ~0x3)) { + prev_fetch_addr = offset & ~0x3; + fetch_buf = fetch_word(prev_fetch_addr, DATALOAD, client); + fetch_buf = machine->physmem->host_to_mips_word(fetch_buf); + } + + uint32 byte_offset_in_word = (offset & 0x03); + if (!machine->cpu->is_bigendian()) { + byte_offset_in_word = 3 - byte_offset_in_word; + } + uint8 rv; + switch (byte_offset_in_word) { + case 0: rv = (fetch_buf >> 24) & 0xff; break; + case 1: rv = (fetch_buf >> 16) & 0xff; break; + case 2: rv = (fetch_buf >> 8) & 0xff; break; + case 3: rv = fetch_buf & 0xff; break; + default: rv = 0; + } + return rv; +} + +void AcceleratorDebugger::store_word(uint32 offset, uint32 data, DeviceExc *client) +{ + fprintf(stderr, "sw %X %x\n", offset, data); + switch (offset) { + case CMD_OFFSET: + cmd = data; + break; + case ARG_OFFSET: + arg = data; + break; + case SEND_OFFSET: + acdebugger->send_commnad(cmd, arg); + break; + case RPLY_OFFSET: + //read only + return; + default: + return; + } +} + +void AcceleratorDebugger::store_byte(uint32 offset, uint8 data, DeviceExc *client) +{ + uint32 word_data; + uint32 store_addr = offset & ~0x3; + word_data = fetch_word(store_addr, DATALOAD, client); + + word_data = machine->physmem->host_to_mips_word(word_data); + + uint32 byte_offset_in_word = (offset & 0x03); + + if (!machine->cpu->is_bigendian()) { + byte_offset_in_word = 3 - byte_offset_in_word; + } + switch (byte_offset_in_word) { + case 0: + word_data = ((data << 24) & 0xff000000) | + (word_data & ~0xff000000); + break; + case 1: + 
word_data = ((data << 16) & 0x00ff0000) | + (word_data & ~0x00ff0000); + break; + case 2: + word_data = ((data << 8) & 0x0000ff00) | + (word_data & ~0x0000ff00); + break; + case 3: + word_data = (data & 0x000000ff) | + (word_data & ~0x000000ff); + break; + default: return; + } + store_word(store_addr, word_data, client); + +} + +void DebuggerClient::__cmd_parser(uint32 cmd, uint8 &op, + uint8 &func, uint8 &mod, uint8 &offset) +{ + op = (cmd >> 24) & 0xFF; + func = (cmd >> 16) & 0xFF; + mod = (cmd >> 8) & 0xFF; + offset = cmd & 0xFF; +} + +bool DebuggerClient::__compare(uint32 a, uint32 b, uint8 comparator) +{ + switch (comparator) { + case DBG_CMD_COMPEQ: + return a == b; + case DBG_CMD_COMPNE: + return a != b; + case DBG_CMD_COMPGT: + return a > b; + case DBG_CMD_COMPLT: + return a < b; + case DBG_CMD_COMPGE: + return a >= b; + case DBG_CMD_COMPLE: + return a <= b; + default: + return false; + } +} diff --git a/debugutils.h b/debugutils.h new file mode 100644 index 0000000..9b0847f --- /dev/null +++ b/debugutils.h @@ -0,0 +1,57 @@ +#ifndef _DEBUGUTILS_H_ +#define _DEBUGUTILS_H_ + +#include "devicemap.h" + +#define ACDBGR_SIZE 16 //4 words +#define CMD_OFFSET 0 +#define ARG_OFFSET 4 +#define SEND_OFFSET 8 +#define RPLY_OFFSET 12 + +//General command format +//|op 8bit|func 8bit|module 8bit|offset 8bit| +#define DBG_CMD_NOP 0 +// set trigger: op = 0 +#define DBG_CMD_SETTRG_OP 1 +// func: comparator +// ==: 0, != 1, >: 2, <: 3, >= 4, <= 5 +#define DBG_CMD_COMPEQ 0 +#define DBG_CMD_COMPNE 1 +#define DBG_CMD_COMPGT 2 +#define DBG_CMD_COMPLT 3 +#define DBG_CMD_COMPGE 4 +#define DBG_CMD_COMPLE 5 +// write op = 1 +#define DBG_CMD_WRITE_OP 2 +// read op = 2 +#define DBG_CMD_READ_OP 3 + + +class DebuggerClient { + protected: + void __cmd_parser(uint32 cmd, uint8 &op, + uint8 &func, uint8 &mod, uint8 &offset); + bool __compare(uint32 a, uint32 b, uint8 comparator); + public: + virtual void send_commnad(uint32 cmd, uint32 arg) = 0; + virtual bool isTriggered() = 0; + virtual 
/*
 * Memory-mapped front end that lets the simulated CPU drive a
 * DebuggerClient through a small register window (ACDBGR_SIZE bytes):
 *   CMD_OFFSET  : command word (read/write)
 *   ARG_OFFSET  : command argument (read/write)
 *   SEND_OFFSET : writing here forwards (cmd, arg) to the client; reads as 0
 *   RPLY_OFFSET : read-only, returns the client's get_dbg_data()
 */
class AcceleratorDebugger : public DeviceMap {
	public:
		// acdebugger_ is the client that receives commands; it is only
		// stored here, not owned (this class never deletes it).
		AcceleratorDebugger(DebuggerClient *acdebugger_);
		// Word access to the register window described above.
		virtual uint32 fetch_word(uint32 offset, int mode, DeviceExc *client);
		// Byte access built on top of fetch_word(); see fetch_buf below.
		virtual uint8 fetch_byte(uint32 offset, DeviceExc *client);
		virtual void store_word(uint32 offset, uint32 data, DeviceExc *client);
		// Read-modify-write of the containing word via fetch_word/store_word.
		virtual void store_byte(uint32 offset, uint8 data, DeviceExc *client);
		// Forwards to the client's trigger check.
		bool isTriggered() { return acdebugger->isTriggered(); };
	private:
		DebuggerClient* acdebugger;	// command target
		uint32 cmd, arg;		// last values written to CMD/ARG
		uint32 prev_fetch_addr;		// word address cached by fetch_byte()
		uint32 fetch_buf;		// word cached by fetch_byte(); not set by the constructor
};
data = 0; - } else { - data = buffer[word_counter]; - } - if (query.burst) { - addr = query.dst + block_words * counter * 4 + - 4 * word_counter; - } else { - addr = query.dst + 4 * counter; - } - if (bus->ready(addr, DATASTORE, this)) { - bus->store_word(addr, data, this); - if (++word_counter == block_words || - !query.burst) { - next_status = DMAC_STAT_WRITE_DONE; + for (int i = 0; i < mem_bandwidth; i++) { + if (query.zero_write) { + data = 0; + } else { + data = buffer[word_counter]; + } + if (query.burst) { + addr = query.dst + block_words * counter * 4 + + 4 * word_counter; + } else { + addr = query.dst + 4 * counter; + } + if (bus->ready(addr, DATASTORE, this)) { + bus->store_word(addr, data, this); + if (++word_counter == block_words || + !query.burst) { + next_status = DMAC_STAT_WRITE_DONE; + break; + } } } break; diff --git a/dmac.h b/dmac.h index aaacb11..63d94f7 100644 --- a/dmac.h +++ b/dmac.h @@ -118,6 +118,7 @@ class DMAC : public DeviceExc, public DeviceInt { DMA_query_t query; int counter; int word_counter; + int mem_bandwidth; bool address_valid(uint32 addr); public: diff --git a/mapper.cc b/mapper.cc index ce24b17..32ded05 100644 --- a/mapper.cc +++ b/mapper.cc @@ -39,7 +39,7 @@ Mapper::Mapper () : opt_bigendian = machine->opt->option("bigendian")->flag; byteswapped = (((opt_bigendian) && (!machine->host_bigendian)) || ((!opt_bigendian) && machine->host_bigendian)); - mem_access_latency = machine->mem_access_latency; + bus_latency = machine->bus_latency; bus_arbiter = new BusArbiter(); } @@ -87,7 +87,8 @@ bool Mapper::ready(uint32 addr, int32 mode, DeviceExc *client) int32 issue_time = access_requests_time[key]; - bool isReady = ((machine->num_cycles - issue_time) >= mem_access_latency); + bool isReady = ((machine->num_cycles - issue_time) >= + (bus_latency + l->extra_latency())); if (isReady) { return l->ready(addr, mode, client); @@ -356,6 +357,10 @@ Mapper::fetch_byte(uint32 addr, DeviceExc *client) uint32 result, oaddr = addr; l = 
/*
 * Parse a tuple-valued option string such as "(1,2)" into its numbers.
 *
 * option  text to parse; must consist of exactly `len` unsigned decimal
 *         numbers, comma separated, inside one pair of parentheses
 *         (whitespace around each number is accepted)
 * len     number of elements expected in the tuple
 *
 * Returns the parsed numbers in order. Calls fatal_error() when the text
 * does not match the expected format or when the regular expressions
 * cannot be constructed.
 *
 * NOTE(review): with len <= 0 the erase() below removes part of the
 * opening "\\(" instead of a trailing comma -- presumably callers always
 * pass len >= 1; confirm.
 * NOTE(review): std::stoi throws on values that do not fit in an int and
 * nothing here catches that -- confirm whether that is acceptable.
 */
std::vector Options::get_tuple(const char *option, int len)
{
	std::vector v;
	std::string str;

	std::regex format_re;
	std::regex num_re;
	std::smatch m;

	// create regex: "\(" + len times "\s*[0-9]+\s*," + "\)"
	str = "\\(";
	for (int i = 0; i < len; i++) {
		str += "\\s*[0-9]+\\s*,";
	}
	// drop the comma appended after the last element
	str.erase(str.size()-1);
	str += "\\)";

	try {
		format_re = std::regex(str);
		num_re = std::regex("[0-9]+");
	} catch(std::regex_error& e) {
		fatal_error("regex is not supported\nPlease rebuild with GCC 4.9 or higher\n");
	}

	str = std::string(option);
	// the whole string must match the tuple format
	if (std::regex_match(str, format_re)) {
		for (int i = 0; i < len; i++) {
			// take the next run of digits, then continue after it
			std::regex_search(str, m, num_re);
			v.push_back(std::stoi(m[0].str()));
			str = m.suffix();
		}
	} else {
		fatal_error("Invalid tuple option: %s", option);
	}

	return v;
}
**/ + { "debuggeraddr", NUM }, + // accelerator debug address + { "realtime", FLAG }, /** If @option{realtime} is set, then the clock device will cause simulated time to run at some fraction of real time, determined by the @@ -371,6 +374,10 @@ static Option nametable[] = { { "cacheprof", FLAG }, /** Report cache profiling results after emulation **/ + { "routerprof", FLAG }, + /** Report router profiling results after emulation **/ + { "exmemprof", FLAG }, + /** Report externam memory profiling results after emulation **/ { "icacheway", NUM }, { "icachebsize", NUM }, @@ -382,7 +389,8 @@ static Option nametable[] = { /* cache configration*/ { "mem_bandwidth", NUM }, - { "mem_access_latency", NUM }, + { "bus_latency", NUM }, + { "exmem_latency", NUM }, /*Router configs*/ { "vcbufsize", NUM }, @@ -395,6 +403,9 @@ static Option nametable[] = { // SNACC options { "snacc_sram_latency", NUM }, + { "snacc_inst_dump", STR }, + { "snacc_mad_debug", STR }, + { NULL, 0 } }; @@ -408,17 +419,20 @@ static const char *defaults_table[] = { "loadaddr=0x81000000", "noinstcounts", "progmemsize=0x2000000", "memsize=0x100000", "nomemdump", "memdumpfile=memdump.bin", "noreportirq", "ttydev=/dev/tty", "ttydev2=off", - "nodebug", "debugport=0", "norealtime", "timeratio=1", "clockspeed=250000", + "nodebug", "debugport=0", "debuggeraddr=0xBC000000", + "norealtime", "timeratio=1", "clockspeed=250000", "clockintr=200000000", "clockdeviceirq=7", "clockdevice", "nodbemsg", "nodecrtc", "nodeccsr", "nodecstat", "nodecserial", "spimconsole", "notracing", "tracesize=100000", "nobigendian", "tracestartpc=0", "traceendpc=0", "execname=none", "nofpu", "notestdev", "nocacheprof", + "norouterprof", "noexmemprof", "dmac", "icacheway=2", "dcacheway=2", "icachebsize=64", "dcachebsize=64", "icachebnum=64", "dcachebnum=64", "mem_bandwidth=1", - "mem_access_latency=8", "vcbufsize=24", "noroutermsg", + "bus_latency=8", "exmem_latency=3", "vcbufsize=24", "noroutermsg", "accelerator0=none", "accelerator1=none", 
"accelerator2=none", - "snacc_sram_latency=1", + "snacc_sram_latency=1", "snacc_inst_dump=disabled", + "snacc_mad_debug=disabled", NULL }; diff --git a/range.cc b/range.cc index a5c197e..0f343ff 100644 --- a/range.cc +++ b/range.cc @@ -59,24 +59,28 @@ Range::overlaps(Range *r) uint32 Range::fetch_word(uint32 offset, int mode, DeviceExc *client) { + read_count++; return ((uint32 *)address)[offset / 4]; } uint16 Range::fetch_halfword(uint32 offset, DeviceExc *client) { + read_count++; return ((uint16 *)address)[offset / 2]; } uint8 Range::fetch_byte(uint32 offset, DeviceExc *client) { + read_count++; return ((uint8 *)address)[offset]; } void Range::store_word(uint32 offset, uint32 data, DeviceExc *client) { + write_count++; uint32 *werd; /* calculate address */ werd = ((uint32 *) address) + (offset / 4); @@ -87,6 +91,7 @@ Range::store_word(uint32 offset, uint32 data, DeviceExc *client) void Range::store_halfword(uint32 offset, uint16 data, DeviceExc *client) { + write_count++; uint16 *halfwerd; /* calculate address */ halfwerd = ((uint16 *) address) + (offset / 2); @@ -97,9 +102,15 @@ Range::store_halfword(uint32 offset, uint16 data, DeviceExc *client) void Range::store_byte(uint32 offset, uint8 data, DeviceExc *client) { + write_count++; uint8 *byte; byte = ((uint8 *) address) + offset; /* store halfword */ *byte = data; } +void Range::report_profile() +{ + fprintf(stderr, "\tRead Count:\t%d\n", read_count); + fprintf(stderr, "\tWrite Count:\t%d\n", write_count); +} \ No newline at end of file diff --git a/range.h b/range.h index 51fcdc8..468532b 100644 --- a/range.h +++ b/range.h @@ -23,6 +23,8 @@ with VMIPS; if not, write to the Free Software Foundation, Inc., #include "accesstypes.h" #include "types.h" #include +#include + class DeviceExc; /* Base class for managing a range of mapped memory. 
Memory-mapped @@ -34,10 +36,14 @@ class Range { uint32 extent; // number of bytes of memory provided void *address; // host machine pointer to start of memory int perms; // MEM_READ, MEM_WRITE, ... in accesstypes.h + // for profile + int read_count; + int write_count; public: Range(uint32 _base, uint32 _extent, caddr_t _address, int _perms) : - base(_base), extent(_extent), address(_address), perms(_perms) { } + base(_base), extent(_extent), address(_address), perms(_perms), + read_count(0), write_count(0) { } virtual ~Range() { } bool incorporates(uint32 addr); @@ -60,6 +66,9 @@ class Range { DeviceExc *client); virtual void store_byte(uint32 offset, uint8 data, DeviceExc *client); virtual bool ready(uint32 offset, int32 mode, DeviceExc *client) { return true; } ; + virtual int extra_latency() { return 0; }; + + void report_profile(); }; diff --git a/remoteram.cc b/remoteram.cc index ebe2fd0..89085ff 100644 --- a/remoteram.cc +++ b/remoteram.cc @@ -3,7 +3,7 @@ RemoteRam::RemoteRam(uint32 node_ID, Router* upperRouter, int mem_size) : CubeAccelerator(node_ID, upperRouter) { - mem = new MemoryModule(mem_size); + mem = new MemoryModule(mem_size, 0); } RemoteRam::~RemoteRam() diff --git a/remoteram.h b/remoteram.h index 5268ef6..576f59a 100644 --- a/remoteram.h +++ b/remoteram.h @@ -21,6 +21,11 @@ class RemoteRam : public CubeAccelerator { void core_reset() {}; const char *accelerator_name() { return "RemoteRam"; } + + virtual void send_commnad(uint32 cmd, uint32 arg) {}; + virtual bool isTriggered() { return false; }; + virtual uint32 get_dbg_data() { return 0; }; + }; diff --git a/rommodule.cc b/rommodule.cc index 1fe382f..6a234ef 100644 --- a/rommodule.cc +++ b/rommodule.cc @@ -23,7 +23,8 @@ with VMIPS; if not, write to the Free Software Foundation, Inc., #include #include -ROMModule::ROMModule (FILE *fp) : Range (0, 0, 0, MEM_READ_WRITE) { +ROMModule::ROMModule (FILE *fp, int latency_) : Range (0, 0, 0, MEM_READ_WRITE), + latency(latency_) { extent = get_file_size 
(fp); // Try to map the file into memory. We use PROT_READ to indicate // read-only access. -> enable to write (edit!) diff --git a/rommodule.h b/rommodule.h index 51ec27d..8ec8cee 100644 --- a/rommodule.h +++ b/rommodule.h @@ -24,11 +24,13 @@ with VMIPS; if not, write to the Free Software Foundation, Inc., #include class ROMModule : public Range { +private: + int latency; public: uint32* data; - ROMModule (FILE *f); + ROMModule (FILE *f, int latency_); virtual ~ROMModule (); - + virtual int extra_latency() { return latency; }; }; #endif // ROMMODULE_H diff --git a/router.cc b/router.cc index c744b7c..1f1f685 100644 --- a/router.cc +++ b/router.cc @@ -235,6 +235,19 @@ void Router::step() } +void Router::report_router() +{ + int send_to_upper = ocUpper->get_send_flit_count(); + int send_to_lower = ocLower->get_send_flit_count(); + + fprintf(stderr, "\tRouter%d\n", myid); + fprintf(stderr, "\t\tTotal %d flits\n", + send_to_upper + send_to_lower); + fprintf(stderr, "\t\t\tTo Upper %d flits\n", send_to_upper); + fprintf(stderr, "\t\t\tTo Lower %d flits\n", send_to_lower); + +} + /******************************* InputChannel *******************************/ InputChannel::InputChannel(RouterPortSlave* iport_, Crossbar *cb_, int *xpos_, bool *ordy_) : iport(iport_), xpos(xpos_), cb(cb_), ordy(ordy_) @@ -292,8 +305,6 @@ void InputChannel::step() bool vc_hold = false; int current_time = machine->num_cycles; - - for (i = 0; i < VCH_SIZE; i++) { if (!ibuf[i].empty()) { flit = ibuf[i].front(); @@ -401,6 +412,7 @@ void OutputChannel::reset() send_count[i] = 0; ack_count[i] = 0; } + send_flit_count = 0; } void OutputChannel::ackSend() @@ -417,6 +429,7 @@ void OutputChannel::ackSend() if (ackSendEn) { RouterUtils::make_ack_flit(&flit, FTYPE_ACK1, ack_count); oport->send(&flit, ACK_VCH); + send_flit_count++; for (int i = 0; i < VCH_SIZE / 2; i++) { if (ack_count[i] > ACK_COUNT_MAX) { ack_count[i] -= ACK_COUNT_MAX; @@ -435,6 +448,7 @@ void OutputChannel::ackSend() if (ackSendEn) { 
RouterUtils::make_ack_flit(&flit, FTYPE_ACK2, &ack_count[VCH_SIZE / 2]); oport->send(&flit, ACK_VCH); + send_flit_count++; for (int i = VCH_SIZE / 2; i < VCH_SIZE; i++) { if (ack_count[i] > ACK_COUNT_MAX) { } else { @@ -455,6 +469,7 @@ void OutputChannel::step() if (!obuf.empty()) { entry = obuf.front(); oport->send((FLIT_t*)(&entry.flit), entry.vch); + send_flit_count++; obuf.pop(); if (ackEnabled) { send_count[entry.vch]++; diff --git a/router.h b/router.h index 807597a..752ba9c 100644 --- a/router.h +++ b/router.h @@ -132,6 +132,8 @@ class OutputChannel { bool ackFormer() { return machine->num_cycles % 2 == 0; } void ackSend(); + // for report + int send_flit_count; public: OutputChannel(RouterPortMaster *oport_, bool ackEnabled_ = true); ~OutputChannel() {}; @@ -142,6 +144,7 @@ class OutputChannel { void pushAck(FLIT_t *flit); void ackIncrement(uint32 vch); bool ocReady(uint32 vch); + int get_send_flit_count() { return send_flit_count; }; }; @@ -255,6 +258,8 @@ class Router { RouterPortMaster *toLocal, *toLower, *toUpper; void setID(int id) { myid = id; }; + + void report_router(); }; diff --git a/snacc.cc b/snacc.cc index fc4f394..b37f19b 100644 --- a/snacc.cc +++ b/snacc.cc @@ -1,5 +1,8 @@ #include "snacc.h" #include "snaccmodules.h" +#include "error.h" + +#include using namespace SNACCComponents; @@ -8,6 +11,12 @@ SNACC::SNACC(uint32 node_ID, Router* upperRouter, int core_count_) SNACC_GLB_OUTOFRANGE, false), core_count(core_count_) { + // check debug option + // std::string opt_inst_dump = + // std::string(opt->option("snacc_inst_dump")->str); + // std::string opt_mad_debug = + // std::string(opt->option("snacc_mad_debug")->str); + cores = new SNACCCore*[core_count]; dmem_upper = new DoubleBuffer*[core_count]; dmem_lower = new DoubleBuffer*[core_count]; @@ -112,3 +121,35 @@ void SNACC::core_reset() } } + +void SNACC::send_commnad(uint32 cmd, uint32 arg) { + +} + +bool SNACC::isTriggered() +{ + return false; +} + +uint32 SNACC::get_dbg_data() +{ + return 0; 
+} + +void SNACC::enable_inst_dump(int core_id) +{ + if (core_id >= 0 && core_id < core_count) { + cores[core_id]->enable_inst_dump(); + } else { + warning("core ID %d for SNACC inst dump exceeds actual core count\n", core_id); + } +} + +void SNACC::enable_mad_debug(int core_id) +{ + if (core_id >= 0 && core_id < core_count) { + cores[core_id]->enable_mad_debug(); + } else { + warning("core ID %d for SNACC mad debug exceeds actual core count\n", core_id); + } +} \ No newline at end of file diff --git a/snacc.h b/snacc.h index d383d82..195fbcd 100644 --- a/snacc.h +++ b/snacc.h @@ -43,6 +43,13 @@ class SNACC : public CubeAccelerator{ const char *accelerator_name() { return "SNACC"; } void core_step(); void core_reset(); + + //for debuger + virtual void send_commnad(uint32 cmd, uint32 arg); + virtual bool isTriggered(); + virtual uint32 get_dbg_data(); + void enable_inst_dump(int core_id); + void enable_mad_debug(int core_id); }; diff --git a/snacccore.cc b/snacccore.cc index 891013c..e5620b1 100644 --- a/snacccore.cc +++ b/snacccore.cc @@ -17,7 +17,7 @@ SNACCCore::SNACCCore(int core_id_, WbufArb *wbuf_arb_) : core_id(core_id_), dmem_u(dmem_u_), dmem_l(dmem_l_), rbuf_u(rbuf_u_), rbuf_l(rbuf_l_), lut(lut_), imem(imem_), wbuf(wbuf_), - wbuf_arb(wbuf_arb_) + wbuf_arb(wbuf_arb_), inst_dump(false) { dbg_msg = machine->opt->option("excmsg")->flag; mad_unit = new MadUnit(machine->opt->option("snacc_sram_latency")->num, @@ -217,6 +217,32 @@ void SNACCCore::wb_stage() //reset status isBranch = false; reg_write = false; + + if (inst_dump) { + disassemble(); + } +} + +void SNACCCore::disassemble() +{ + fprintf(stderr, "%d:\tSNACC\t", machine->num_cycles); + switch(dec_opcode) { + case SNACC_CORE_OPCODE_RTYPE0: + fprintf(stderr, RType0InstrFormat[dec_func], dec_rd, dec_rs); + break; + case SNACC_CORE_OPCODE_RTYPE1: + fprintf(stderr, RType1InstrFormat[dec_func], dec_rd, dec_rs); + break; + case SNACC_CORE_OPCODE_RTYPE2: + fprintf(stderr, RType2InstrFormat[dec_func], dec_rd, 
dec_rs); + break; + case SNACC_CORE_OPCODE_JUMP: + fprintf(stderr, InstrFormat[dec_opcode], dec_imm); /* InstrFormat is keyed by opcode (entry 5 is "Jump 0x%X"), same as the default case; dec_func could pick a two-argument format */ + break; + default: + fprintf(stderr, InstrFormat[dec_opcode], dec_rd, dec_imm); + } + fprintf(stderr, "\n"); } diff --git a/snacccore.h b/snacccore.h index 7be4697..218b49f 100644 --- a/snacccore.h +++ b/snacccore.h @@ -26,7 +26,9 @@ #define SNACC_CTRLREG_SIZE 16 //Opcode +#define SNACC_CORE_OPCODE_RTYPE0 0 #define SNACC_CORE_OPCODE_RTYPE1 1 +#define SNACC_CORE_OPCODE_RTYPE2 2 #define SNACC_CORE_OPCODE_BNEQ 4 #define SNACC_CORE_OPCODE_JUMP 5 @@ -106,6 +108,8 @@ class SNACCCore { int status; int stall_cause; + void disassemble(); + // SRAM modules uint32 access_address; DoubleBuffer *access_mem; @@ -145,6 +149,9 @@ class SNACCCore { static const MemberFuncPtr kRTypeMemoryTable[16]; static const MemberFuncPtr kRTypeSimdTable[16]; + // for debug + bool inst_dump; + public: SNACCCore(int core_id_, DoubleBuffer *dmem_u_, @@ -159,6 +166,8 @@ class SNACCCore { void step(); void reset(); bool isDone() { return done; }; + void enable_inst_dump() { inst_dump = true; }; + void enable_mad_debug() { mad_unit->enable_debug(); }; private: void Unknown(); @@ -204,6 +213,35 @@ class SNACCCore { }; +static const char* InstrFormat[16] = { + "", "", "", "Loadi r%d, 0x%X", + "Bneq r%d, 0x%X", "Jump 0x%X", + "Mad r%d, 0x%X", "Madlp r%d, 0x%X", + "Setcr r%d, 0x%X", "Addi r%d, 0x%X", + "Subi r%d, 0x%X", "Sll r%d, 0x%X", + "Srl r%d, 0x%X", "Sra r%d, 0x%X", + "Unknown", "Unknown" }; + + +static const char* RType0InstrFormat[16] = { + "Nop", "Mov r%d, r%d", "Add r%d, r%d", "Sub r%d, r%d", + "Mul r%d, r%d", "And r%d, r%d", "Or r%d, r%d","Xor r%d, r%d", + "Neg r%d, r%d", "Unknown", "Unknown", "Unknown", + "Unknown", "Unknown", "Unknown", "Unknown" }; + +static const char* RType1InstrFormat[16] = { + "Halt", "Loadw r%d, r%d", "Storew r%d, r%d", "Loadh r%d, r%d", + "Storeh r%d, r%d", "Unknown", "Unknown", "Readcr", + "Unknown", "Unknown", + "Dbchange %d, %d", "Dma r%d, r%d", + "Unknown", 
"Unknown", + "Unknown", "Unknown" }; + +static const char* RType2InstrFormat[16] = { + "Nop", "Loadv r%d, r%d", "Unknown", "Unknown", + "Unknown", "Unknown", "Unknown", "Unknown", + "Unknown", "Unknown", "Unknown", "Unknown", + "Unknown", "Unknown", "Unknown", "Unknown" }; #endif //_SNACCCORE_H_ diff --git a/snaccmodules.cc b/snaccmodules.cc index c760bbe..09e5045 100644 --- a/snaccmodules.cc +++ b/snaccmodules.cc @@ -247,7 +247,7 @@ MadUnit::MadUnit(uint32 sram_latency_, DoubleBuffer *dmem_u_, sram_latency(sram_latency_), dmem_u(dmem_u_), dmem_l(dmem_l_), rbuf_u(rbuf_u_), rbuf_l(rbuf_l_), lut(lut_), - TR0(TR0_), TR1(TR1_), FR0(FR0_), FR1(FR1_) + TR0(TR0_), TR1(TR1_), FR0(FR0_), FR1(FR1_), debug_print(false) { } @@ -453,10 +453,14 @@ void MadUnit::loadData(Fixed16 *array) void MadUnit::doMad() { Fixed16 weight[SNACC_SIMD_LANE_SIZE/2]; + Fixed16 data[SNACC_SIMD_LANE_SIZE]; //load weight loadWeight(weight); + if (debug_print) { + fprintf(stderr, "MAD\nbefore TR0 0x%08X, TR1 0x%08X\n", + tr0_fp, tr1_fp); + } if (eight_bit_mode) { - Fixed16 data[SNACC_SIMD_LANE_SIZE]; loadData(data); for (int i = 0; i < SNACC_SIMD_LANE_SIZE/2; i++) { if (mask[i]) { @@ -466,11 +470,11 @@ void MadUnit::doMad() for (int i = SNACC_SIMD_LANE_SIZE/2; i < SNACC_SIMD_LANE_SIZE; i++) { if (mask[i]) { - tr1_fp = tr1_fp + weight[i] * data[i]; + tr1_fp = tr1_fp + + weight[i - SNACC_SIMD_LANE_SIZE/2] * data[i]; } } } else { - Fixed16 data[SNACC_SIMD_LANE_SIZE/2]; loadData(data); for (int i = 0; i < SNACC_SIMD_LANE_SIZE/2; i++) { if (mask[i]) { @@ -478,12 +482,29 @@ void MadUnit::doMad() } } } + if (debug_print) { + int half_lane = SNACC_SIMD_LANE_SIZE/ 2; + int max_lane = eight_bit_mode ? 
SNACC_SIMD_LANE_SIZE : half_lane; + for (int i = 0; i < max_lane; i++) { + if (mask[i]) { + fprintf(stderr, "mul%d: 0x%04X * 0x%04X = 0x%04X\n", i, + weight[i % half_lane], data[i], + weight[i % half_lane] * data[i]); + } else { + fprintf(stderr, "mul%d: masked\n", i); + } + } + fprintf(stderr, "after TR0 0x%08X, TR1 0x%08X\n", + tr0_fp, tr1_fp); + } } void MadUnit::doMaxPool() { + Fixed32 prev_tr0 = tr0_fp; + Fixed32 prev_tr1 = tr1_fp; + Fixed16 data[SNACC_SIMD_LANE_SIZE]; if (eight_bit_mode) { - Fixed16 data[SNACC_SIMD_LANE_SIZE]; loadData(data); for (int i = 0; i < SNACC_SIMD_LANE_SIZE/2; i++) { if (mask[i]) { @@ -499,7 +520,6 @@ void MadUnit::doMaxPool() } } } else { - Fixed16 data[SNACC_SIMD_LANE_SIZE/2]; loadData(data); for (int i = 0; i < SNACC_SIMD_LANE_SIZE/2; i++) { if (mask[i]) { @@ -508,6 +528,29 @@ void MadUnit::doMaxPool() } } } + if (debug_print) { + fprintf(stderr, "MAXPOOL\nmax(0x%08X, ", prev_tr0); + for (int i = 0; i < SNACC_SIMD_LANE_SIZE/2; i++) { + if (mask[i]) { + fprintf(stderr, "0x%08X, ", data[i].ToFixed32()); + } else { + fprintf(stderr, "masked, "); + } + } + fprintf(stderr, "\b\b) = 0x%08X\n", tr0_fp); + if (eight_bit_mode) { + fprintf(stderr, "max(0x%08X, ", prev_tr1); + for (int i = SNACC_SIMD_LANE_SIZE/2; + i < SNACC_SIMD_LANE_SIZE; i++) { + if (mask[i]) { + fprintf(stderr, "0x%08X, ", data[i].ToFixed32()); + } else { + fprintf(stderr, "masked, "); + } + } + fprintf(stderr, "\b\b) = 0x%08X\n", tr1_fp); + } + } } void MadUnit::doAvgPool() @@ -519,4 +562,4 @@ bool MadUnit::running() return (state != SNACC_MAD_STAT_IDLE) || (state == SNACC_MAD_STAT_IDLE && next_state != SNACC_MAD_STAT_IDLE); -} \ No newline at end of file +} diff --git a/snaccmodules.h b/snaccmodules.h index f952c09..37f6362 100644 --- a/snaccmodules.h +++ b/snaccmodules.h @@ -6,6 +6,7 @@ #include "snaccAddressMap.h" #include +#include #define SNACC_WBUF_ARB_4CORE 0 #define SNACC_WBUF_ARB_2CORE 2 @@ -173,7 +174,7 @@ uint32 SignedClipMostSignificant4Bits(uint32 before); 
// <8.24> bits signed fixed point decimal number, which is // internal representation of multiply-and-add unit. - struct Fixed32 { + class Fixed32 { public: Fixed32() : num_(0) {} @@ -238,6 +239,9 @@ uint32 SignedClipMostSignificant4Bits(uint32 before); Fixed32 tr0_fp, tr1_fp, fr0_fp, fr1_fp; + // for debug + bool debug_print; + bool overDmemBoundary(); bool overRbufBoundary(); void updataAddress(); @@ -274,7 +278,7 @@ uint32 SignedClipMostSignificant4Bits(uint32 before); void step(); void reset(); bool running(); - + void enable_debug() { debug_print = true; }; }; } diff --git a/vmips.cc b/vmips.cc index cbd36de..55b85b4 100644 --- a/vmips.cc +++ b/vmips.cc @@ -61,6 +61,8 @@ with VMIPS; if not, write to the Free Software Foundation, Inc., #include "cma.h" #include "snacc.h" #include "dmac.h" +#include "debugutils.h" +#include vmips *machine; @@ -80,6 +82,8 @@ vmips::refresh_options(void) opt_memdump = opt->option("memdump")->flag; opt_realtime = opt->option("realtime")->flag; opt_cache_prof = opt->option("cacheprof")->flag; + opt_router_prof = opt->option("routerprof")->flag; + opt_exmem_prof = opt->option("exmemprof")->flag; opt_clockspeed = opt->option("clockspeed")->num; clock_nanos = 1000000000/opt_clockspeed; @@ -88,6 +92,7 @@ vmips::refresh_options(void) opt_clockdeviceirq = opt->option("clockdeviceirq")->num; opt_loadaddr = opt->option("loadaddr")->num; opt_bootaddr = opt->option("bootaddr")->num; + opt_debuggeraddr = opt->option("debuggeraddr")->num; opt_memsize = opt->option("memsize")->num; opt_progmemsize = opt->option("progmemsize")->num; opt_timeratio = opt->option("timeratio")->num; @@ -106,9 +111,9 @@ vmips::refresh_options(void) opt_spimconsole = opt->option("spimconsole")->flag; opt_testdev = opt->option("testdev")->flag; mem_bandwidth = opt->option("mem_bandwidth")->num; - mem_access_latency = opt->option("mem_access_latency")->num; + bus_latency = opt->option("bus_latency")->num; vcbufsize = opt->option("vcbufsize")->num; - + exmem_latency = 
opt->option("exmem_latency")->num; } @@ -438,7 +443,7 @@ vmips::setup_prog () // Translate loadaddr to physical address. try { - mem_prog = new MemoryModule(opt_progmemsize, bin_file); + mem_prog = new MemoryModule(opt_progmemsize, exmem_latency, bin_file); } catch (int errcode) { error ("mmap failed for %s: %s", opt_image, strerror (errcode)); return false; @@ -465,9 +470,8 @@ vmips::setup_bootrom () return false; } // Translate loadaddr to physical address. - ROMModule *rm; try { - rm = new ROMModule (rom); + rm = new ROMModule (rom, exmem_latency); } catch (int errcode) { error ("mmap failed for %s: %s", opt_boot, strerror (errcode)); return false; @@ -488,7 +492,7 @@ bool vmips::setup_ram () { // Make a new RAM module and install it at base physical address 0. - memmod = new MemoryModule(opt_memsize); + memmod = new MemoryModule(opt_memsize, exmem_latency); physmem->map_at_physical_address(memmod, 0); // memmod2 = new MemoryModule(0x100000); @@ -571,15 +575,38 @@ bool vmips::setup_dmac() bool vmips::setup_cube() { + std::vector snacc_inst_dump(2, -1), snacc_mad_debug(2, -1); + bool snacc_inst_dump_fail, snacc_mad_debug_fail; + snacc_inst_dump_fail = snacc_mad_debug_fail = false; std::string ac0_name = std::string(opt->option("accelerator0")->str); std::string ac1_name = std::string(opt->option("accelerator1")->str); std::string ac2_name = std::string(opt->option("accelerator2")->str); + //get snacc options + std::string opt_str = std::string(opt->option("snacc_inst_dump")->str); + if (opt_str != std::string("disabled")) { + snacc_inst_dump = opt->get_tuple(opt_str.c_str(), 2); + snacc_inst_dump_fail = true; + } + opt_str = std::string(opt->option("snacc_mad_debug")->str); + if (opt_str != std::string("disabled")) { + snacc_mad_debug = opt->get_tuple(opt_str.c_str(), 2); + snacc_mad_debug_fail = true; + } + //setup accelerator0 if (ac0_name == std::string("CMA")) { ac0 = new CMA(1, rtif->getRouter()); } else if (ac0_name == std::string("SNACC")) { ac0 = new 
SNACC(1, rtif->getRouter(), 4); + if (snacc_inst_dump[0] == 0) { + ((SNACC*)(ac0))->enable_inst_dump(snacc_inst_dump[1]); + snacc_inst_dump_fail = false; + } + if (snacc_mad_debug[0] == 0) { + ((SNACC*)(ac0))->enable_mad_debug(snacc_mad_debug[1]); + snacc_mad_debug_fail = false; + } } else if (ac0_name == std::string("RemoteRam")) { ac0 = new RemoteRam(1, rtif->getRouter(), 0x2048); //2KB } else if (ac0_name != std::string("none")) { @@ -589,6 +616,11 @@ vmips::setup_cube() if (ac0 != NULL) { ac0->setup(); + if (opt_debug) { + ac0_dbg = new AcceleratorDebugger(ac0); + physmem->map_at_physical_address(ac0_dbg, opt_debuggeraddr); + dbgr->register_ac_debbuger(ac0_dbg); + } } //setup accelerator1 @@ -600,6 +632,15 @@ vmips::setup_cube() ac1 = new CMA(2, ac0->getRouter()); } else if (ac1_name == std::string("SNACC")) { ac1 = new SNACC(2, ac0->getRouter(), 4); + /* ac0 is already set up and must not be re-created here; chip ID 1 in the option tuple selects ac1 */ + if (snacc_inst_dump[0] == 1) { + ((SNACC*)(ac1))->enable_inst_dump(snacc_inst_dump[1]); + snacc_inst_dump_fail = false; + } + if (snacc_mad_debug[0] == 1) { + ((SNACC*)(ac1))->enable_mad_debug(snacc_mad_debug[1]); + snacc_mad_debug_fail = false; + } } else if (ac1_name == std::string("RemoteRam")) { ac1 = new RemoteRam(2, ac0->getRouter(), 0x2048); //2KB } else { @@ -610,6 +651,12 @@ vmips::setup_cube() if (ac1 != NULL) { ac1->setup(); + if (opt_debug) { + ac1_dbg = new AcceleratorDebugger(ac1); + physmem->map_at_physical_address(ac1_dbg, opt_debuggeraddr + + ACDBGR_SIZE); + dbgr->register_ac_debbuger(ac1_dbg); + } } //setup accelerator2 @@ -621,6 +668,15 @@ vmips::setup_cube() ac2 = new CMA(3, ac1->getRouter()); } else if (ac1_name == std::string("SNACC")) { ac2 = new SNACC(3, ac1->getRouter(), 4); + /* ac0 is already set up and must not be re-created here; chip ID 2 in the option tuple selects ac2 */ + if (snacc_inst_dump[0] == 2) { + ((SNACC*)(ac2))->enable_inst_dump(snacc_inst_dump[1]); + snacc_inst_dump_fail = false; + } + if (snacc_mad_debug[0] == 2) { + ((SNACC*)(ac2))->enable_mad_debug(snacc_mad_debug[1]); + 
snacc_mad_debug_fail = false; + } } else if (ac1_name == std::string("RemoteRam")) { ac2 = new RemoteRam(3, ac1->getRouter(), 0x2048); //2KB } else { @@ -631,6 +687,21 @@ vmips::setup_cube() if (ac2 != NULL) { ac2->setup(); + if (opt_debug) { + ac2_dbg = new AcceleratorDebugger(ac2); + physmem->map_at_physical_address(ac2_dbg, opt_debuggeraddr + + ACDBGR_SIZE * 2); + dbgr->register_ac_debbuger(ac2_dbg); + } + } + + if (snacc_inst_dump_fail) { + warning("SNACC inst dump option for node %d is ignored\n", + snacc_inst_dump[0]); + } + if (snacc_mad_debug_fail) { + warning("SNACC mad debug option for node %d is ignored\n", + snacc_mad_debug[0]); } return true; @@ -817,6 +888,31 @@ vmips::run() cpu->icache->report_prof(); fprintf(stderr, "Data Cache Profile\n"); cpu->dcache->report_prof(); + fprintf(stderr, "\n"); + } + + if (opt_router_prof) { + fprintf(stderr, "Router Profile\n"); + rtif->getRouter()->report_router(); + if (ac0 != NULL) { + ac0->getRouter()->report_router(); + } + if (ac1 != NULL) { + ac1->getRouter()->report_router(); + } + if (ac2 != NULL) { + ac2->getRouter()->report_router(); + } + } + + if (opt_exmem_prof) { + fprintf(stderr, "Memory profile\n"); + fprintf(stderr, " Main memory\n"); + ((Range*)(memmod))->report_profile(); + fprintf(stderr, " Program memory\n"); + ((Range*)(mem_prog))->report_profile(); + fprintf(stderr, " Boot rom\n"); + ((Range*)(rm))->report_profile(); } /* We're done. 
*/ diff --git a/vmips.h b/vmips.h index adb1204..45c2597 100644 --- a/vmips.h +++ b/vmips.h @@ -30,6 +30,7 @@ class CPU; class IntCtrl; class Options; class MemoryModule; +class ROMModule; class Debug; class Clock; class ClockDevice; @@ -48,6 +49,7 @@ class RouterRange; class RouterIOReg; class CubeAccelerator; class DMAC; +class AcceleratorDebugger; long timediff(struct timeval *after, struct timeval *before); @@ -63,6 +65,7 @@ class vmips Options *opt; MemoryModule *memmod; MemoryModule *mem_prog; + ROMModule *rm; Debug *dbgr; Disassembler *disasm; bool host_bigendian; @@ -83,6 +86,7 @@ class vmips RouterIOReg *rtIO; RouterRange *rtrange_kseg0, *rtrange_kseg1; CubeAccelerator *ac0, *ac1, *ac2; + AcceleratorDebugger *ac0_dbg, *ac1_dbg, *ac2_dbg; DMAC *dmac; /* Cached versions of options: */ @@ -104,12 +108,15 @@ class vmips bool opt_spimconsole; bool opt_testdev; bool opt_cache_prof; + bool opt_exmem_prof; + bool opt_router_prof; uint32 opt_clockspeed; uint32 clock_nanos; uint32 opt_clockintr; uint32 opt_clockdeviceirq; uint32 opt_loadaddr; uint32 opt_bootaddr; + uint32 opt_debuggeraddr; uint32 opt_memsize; uint32 opt_progmemsize; uint32 opt_timeratio; @@ -122,7 +129,8 @@ class vmips uint32 num_cycles; uint32 stall_count; uint32 mem_bandwidth; - uint32 mem_access_latency; + uint32 bus_latency; + uint32 exmem_latency; uint32 vcbufsize; private: diff --git a/vmipsrc b/vmipsrc index e507e94..8761580 100644 --- a/vmipsrc +++ b/vmipsrc @@ -49,6 +49,7 @@ reportirq # tracestartpc=0 # ttydev2=off # ttydev=/dev/tty +exmemprof ## cache config cacheprof icacheway=2 @@ -57,6 +58,9 @@ icachebnum=64 dcachebnum=64 icachebsize=64 dcachebsize=64 +## bus config +exmem_latency=3 +bus_latency=8 ## ## The following options have defaults set by `configure': ##