compile_encoder.py

import onnx
import tvm
from tvm import relax
from tvm.relax.frontend.onnx import from_onnx  # Correct import path
from tvm.relax.dpl import is_op, wildcard
from tvm.contrib import cc

def riscv_fcompile(file_name, files, options=None, **kwargs):
    """Link a TVM-exported module into a shared library with the RISC-V cross g++.

    Drop-in ``fcompile`` hook for ``export_library``: forwards to
    ``cc.create_shared`` but forces the RISC-V cross compiler and appends the
    ISA/ABI flags the target board needs.

    Parameters
    ----------
    file_name : str
        Output path of the shared library.
    files : list of str
        Object/source files to link.
    options : list of str, optional
        Extra compiler options. A copy is taken, so the caller's list is
        never mutated (the original implementation appended in place).
    **kwargs
        Forwarded verbatim to ``cc.create_shared``.
    """
    # Copy instead of mutating the caller's list — appending to a list the
    # caller still holds would grow it on every invocation.
    options = list(options) if options is not None else []
    options.append("-march=rv64imafdcv")  # example: select ISA extensions
    options.append("-mabi=lp64d")         # example: double-float ABI

    return cc.create_shared(
        file_name,
        files,
        options=options,
        cc="riscv64-unknown-linux-gnu-g++",  # your RISC-V cross compiler
        **kwargs
    )
def compile_model(onnx_path, target="llvm"):
	"""Compile an ONNX encoder model into a TVM shared library.

	Pipeline: load ONNX -> import to Relax with static input shapes ->
	offload matmuls to the "bananapi" external codegen -> legalize/fold ->
	build for `target` -> export a cross-compiled .so next to the input.

	Parameters
	----------
	onnx_path : str
		Path to the .onnx file; output path is the same with a .so suffix.
	target : str
		TVM target string (e.g. an LLVM RISC-V cross target).

	Returns
	-------
	str
		Path of the exported shared library.
	"""
	# 1. Load ONNX model
	onnx_model = onnx.load(onnx_path) 
	# 2. Convert to Relax IR (updated API)
	mod = from_onnx(onnx_model, {"input_features": (1, 80, 3000)})# give input shape of both encoder and decoder, make them static. Some ops do not support dynamic shapes

	#mod = from_onnx(onnx_model, {"input_ids": (1, 1), "encoder_hidden_states": (1, 1500, 384)})# give input shape of both encoder and decoder, make them static. Somer op does not support dynamic shape
	
	#mod = from_onnx(onnx_model)
	#mod=tvm.relax.transform.BindSymbolicVars({"batch_size":1, "encoder_sequence_length_out": 1500})(mod)

	# Offload every relax.matmul (any two operands) to the "bananapi" codegen.
	patterns = [("bananapi.matmul", is_op("relax.matmul")(wildcard(), wildcard()))]
	#patterns = [("tensorrt.add", is_op("relax.add")(wildcard(), wildcard()))]

	# Translation of the note below:
	#   annotate_codegen: do NOT merge adjacent ops — one op per Relax function.
	#   bind_constants: if from_onnx used keep_params_in_input=False (default),
	#   set bind_constants=False here; with keep_params_in_input=True keep the
	#   default bind_constants=True.
	'''
	annotate_codegen: 不要 Merge 相鄰的 OP,一個 OP 一個 Relax function
	bind_constants: 綁定常數,如果前面 from_onnx 的 keep_params_in_input=False(預設) 這裡要設成 bind_constants=False
						 如果前面 from_onnx 的 keep_params_in_input=True		這裡要設成 bind_constants=True(預設)
	'''
	mod = relax.transform.FuseOpsByPattern(patterns, bind_constants=False, annotate_codegen=True)(mod)
	#mod = relax.transform.FuseOpsByPattern(patterns, bind_constants=False)(mod)
	#mod = relax.transform.FuseOpsByPattern(patterns)(mod)
	#mod.show()

	#mod = relax.transform.MergeCompositeFunctions()(mod)
	#mod.show()

	# Generate code for the fused external (bananapi) functions.
	mod = relax.transform.RunCodegen()(mod)
	#mod.show()

	# 3. Apply mandatory passes
	seq = tvm.ir.transform.Sequential([
		relax.transform.LegalizeOps(),
		relax.transform.FoldConstant(),
		relax.transform.DeadCodeElimination()
	])
	mod = seq(mod)

	# Check if output IRModule is well-formed. 
	#assert relax.analysis.well_formed(mod)
	# 4. Build
	ex = relax.build(mod, target)
	
	# 5. Save
	# NOTE(review): str.replace would also rewrite ".onnx" occurring mid-path;
	# fine for the fixed filenames used below.
	output_path = onnx_path.replace(".onnx", ".so")
	ex.export_library(output_path, fcompile=riscv_fcompile)
	return output_path

# Compile both encoder and decoder
# NOTE(review): -mattr does not include +v here, yet -vector-width=128 is
# passed — confirm the vector flag has any effect without the V extension.
encoder_so = compile_model("encoder_model.onnx", target="llvm -mtriple=riscv64-unknown-linux-gnu -mattr=+m,+a,+f,+d,+c -vector-width=128")
#decoder_so = compile_model("decoder_model.onnx", target="llvm -mtriple=riscv64-unknown-linux-gnu -mattr=+m,+a,+f,+d,+c -vector-width=128")

compile_decoder.py

import onnx
import tvm
from tvm import relax
from tvm.relax.frontend.onnx import from_onnx  # Correct import path
from tvm.relax.dpl import is_op, wildcard
from tvm.contrib import cc

def riscv_fcompile(file_name, files, options=None, **kwargs):
    """fcompile hook: build the shared library with the RISC-V cross g++."""
    if options is None:
        options = []
    # Target-specific flags: ISA extensions and the double-float ABI.
    options += ["-march=rv64imafdcv", "-mabi=lp64d"]
    return cc.create_shared(
        file_name,
        files,
        options=options,
        cc="riscv64-unknown-linux-gnu-g++",  # the RISC-V cross compiler
        **kwargs
    )
def compile_model(onnx_path, target="llvm"):
	"""Compile the ONNX decoder (prefill) model into a TVM shared library.

	Pipeline: load ONNX -> import to Relax with static input shapes ->
	offload matmuls to the "bananapi" external codegen -> legalize/fold ->
	build for `target` -> export a cross-compiled .so next to the input.

	Parameters
	----------
	onnx_path : str
		Path to the .onnx file; output path is the same with a .so suffix.
	target : str
		TVM target string (e.g. an LLVM RISC-V cross target).

	Returns
	-------
	str
		Path of the exported shared library.
	"""
	# 1. Load ONNX model
	onnx_model = onnx.load(onnx_path) 
	# 2. Convert to Relax IR (updated API)
	#mod = from_onnx(onnx_model, {"input_features": (1, 80, 3000)})# give input shape of both encoder and decoder, make them static. Somer op does not support dynamic shape

	mod = from_onnx(onnx_model, {"input_ids": (1, 1), "encoder_hidden_states": (1, 1500, 384)})# give input shape of both encoder and decoder, make them static. Some ops do not support dynamic shapes
	
	#mod = from_onnx(onnx_model)
	#mod=tvm.relax.transform.BindSymbolicVars({"batch_size":1, "encoder_sequence_length_out": 1500})(mod)

	# Offload every relax.matmul (any two operands) to the "bananapi" codegen.
	patterns = [("bananapi.matmul", is_op("relax.matmul")(wildcard(), wildcard()))]
	#patterns = [("tensorrt.add", is_op("relax.add")(wildcard(), wildcard()))]

	# Translation of the note below:
	#   annotate_codegen: do NOT merge adjacent ops — one op per Relax function.
	#   bind_constants: if from_onnx used keep_params_in_input=False (default),
	#   set bind_constants=False here; with keep_params_in_input=True keep the
	#   default bind_constants=True.
	'''
	annotate_codegen: 不要 Merge 相鄰的 OP,一個 OP 一個 Relax function
	bind_constants: 綁定常數,如果前面 from_onnx 的 keep_params_in_input=False(預設) 這裡要設成 bind_constants=False
						 如果前面 from_onnx 的 keep_params_in_input=True		這裡要設成 bind_constants=True(預設)
	'''
	mod = relax.transform.FuseOpsByPattern(patterns, bind_constants=False, annotate_codegen=True)(mod)
	#mod = relax.transform.FuseOpsByPattern(patterns, bind_constants=False)(mod)
	#mod = relax.transform.FuseOpsByPattern(patterns)(mod)
	#mod.show()

	#mod = relax.transform.MergeCompositeFunctions()(mod)
	#mod.show()

	# Generate code for the fused external (bananapi) functions.
	mod = relax.transform.RunCodegen()(mod)
	#mod.show()

	# 3. Apply mandatory passes
	seq = tvm.ir.transform.Sequential([
		relax.transform.LegalizeOps(),
		relax.transform.FoldConstant(),
		relax.transform.DeadCodeElimination()
	])
	mod = seq(mod)

	# Check if output IRModule is well-formed. 
	#assert relax.analysis.well_formed(mod)
	# 4. Build
	ex = relax.build(mod, target)
	
	# 5. Save
	# NOTE(review): str.replace would also rewrite ".onnx" occurring mid-path;
	# fine for the fixed filenames used below.
	output_path = onnx_path.replace(".onnx", ".so")
	ex.export_library(output_path, fcompile=riscv_fcompile)
	return output_path

# Compile both encoder and decoder
#encoder_so = compile_model("encoder_model.onnx", target="llvm -mtriple=riscv64-unknown-linux-gnu -mattr=+m,+a,+f,+d,+c -vector-width=128")
# NOTE(review): -mattr does not include +v here, yet -vector-width=128 is
# passed — confirm the vector flag has any effect without the V extension.
decoder_so = compile_model("decoder_model.onnx", target="llvm -mtriple=riscv64-unknown-linux-gnu -mattr=+m,+a,+f,+d,+c -vector-width=128")

compile_decoder_with_past.py

import onnx
import tvm
from tvm import relax
from tvm.relax.frontend.onnx import from_onnx  # Correct import path
from tvm.relax.dpl import is_op, wildcard
from tvm.contrib import cc

def riscv_fcompile(file_name, files, options=None, **kwargs):
    """fcompile hook for export_library: link with the RISC-V cross g++.

    Appends the ISA/ABI flags for the target board and forwards everything
    else to ``cc.create_shared``.
    """
    if options is None:
        options = []
    # Add any necessary RISC-V compile options, e.g. the floating-point ABI.
    options.append("-march=rv64imafdcv") # example: select ISA extensions
    options.append("-mabi=lp64d")   # example: select the ABI

    return cc.create_shared(
        file_name,
        files,
        options=options,
        cc="riscv64-unknown-linux-gnu-g++", # your RISC-V cross compiler

        **kwargs
    )

def compile_model(onnx_path, target="llvm"):
	"""Compile the ONNX decoder-with-past model into a TVM shared library.

	Unlike the encoder/decoder scripts, this variant imports with symbolic
	shapes and pins them via BindSymbolicVars, and it does NOT offload any
	ops to the external codegen (only LegalizeOps runs).

	Parameters
	----------
	onnx_path : str
		Path to the .onnx file; output path is the same with a .so suffix.
	target : str
		TVM target string (e.g. an LLVM RISC-V cross target).

	Returns
	-------
	str
		Path of the exported shared library.
	"""
	# 1. Load ONNX model
	onnx_model = onnx.load(onnx_path) 
	# 2. Convert to Relax IR (updated API)

	#mod = from_onnx(onnx_model, {"input_features": (1, 80, 3000)})# give input shape of both encoder and decoder, make them static. Somer op does not support dynamic shape

	#mod = from_onnx(onnx_model, {"input_ids": (1, 1), "encoder_hidden_states": (1, 1500, 384)})# give input shape of both encoder and decoder, make them static. Somer op does not support dynamic shape
	
	# Import with symbolic shapes, then bind the symbolic vars to constants.
	mod = from_onnx(onnx_model)
	mod=tvm.relax.transform.BindSymbolicVars({"batch_size":1, "encoder_sequence_length_out": 1500})(mod)

	#patterns = [("bananapi.matmul", is_op("relax.matmul")(wildcard(), wildcard()))]

	#patterns = [("tensorrt.add", is_op("relax.add")(wildcard(), wildcard()))]

	# Translation of the note below:
	#   annotate_codegen: do NOT merge adjacent ops — one op per Relax function.
	#   bind_constants: if from_onnx used keep_params_in_input=False (default),
	#   set bind_constants=False here; with keep_params_in_input=True keep the
	#   default bind_constants=True.
	'''
	annotate_codegen: 不要 Merge 相鄰的 OP,一個 OP 一個 Relax function
	bind_constants: 綁定常數,如果前面 from_onnx 的 keep_params_in_input=False(預設) 這裡要設成 bind_constants=False
						 如果前面 from_onnx 的 keep_params_in_input=True		這裡要設成 bind_constants=True(預設)
	'''
	#mod = relax.transform.FuseOpsByPattern(patterns, bind_constants=False, annotate_codegen=True)(mod)
	#mod = relax.transform.FuseOpsByPattern(patterns, bind_constants=False)(mod)
	#mod = relax.transform.FuseOpsByPattern(patterns)(mod)
	#mod.show()

	#mod = relax.transform.MergeCompositeFunctions()(mod)
	#mod.show()

	#mod = relax.transform.RunCodegen()(mod)
	#mod.show()

	# 3. Apply mandatory passes
	seq = tvm.ir.transform.Sequential([
		relax.transform.LegalizeOps(),
		#relax.transform.FoldConstant(),
		#relax.transform.DeadCodeElimination()
	])
	mod = seq(mod)

	# Check if output IRModule is well-formed. 
	#assert relax.analysis.well_formed(mod)
	# 4. Build
	ex = relax.build(mod, target)
	
	# 5. Save
	# NOTE(review): str.replace would also rewrite ".onnx" occurring mid-path;
	# fine for the fixed filenames used below.
	output_path = onnx_path.replace(".onnx", ".so")
	ex.export_library(output_path, fcompile=riscv_fcompile)
	return output_path

# Compile both encoder and decoder
#encoder_so = compile_model("encoder_model.onnx", target="llvm")
#decoder_so = compile_model("decoder_model.onnx", target="llvm")
# NOTE(review): this target omits -vector-width=128 used by the other two
# compile scripts — confirm the inconsistency is intentional.
decoder_so = compile_model("decoder_with_past_model.onnx", target="llvm -mtriple=riscv64-unknown-linux-gnu -mattr=+m,+a,+f,+d,+c")

inference.py

import numpy as np
import tvm
from tvm import relax, runtime
from tvm.relax import VirtualMachine
from transformers import WhisperProcessor, WhisperTokenizer

from datetime import datetime
import csv
from collections import defaultdict
import numpy as np
import soundfile as sf
from scipy import signal

# Global aggregation dict: {Name: [total_duration_us, total_count]}
profile_agg = defaultdict(lambda: [0, 0])
# Wall-clock timing of the whole script (encoder + decoder + decode loop).
start_time_all = datetime.now()
print("Start of all:", start_time_all)

# === Build the 16 empty past-KV tensors (shared by prefill and the step loop) ===
def init_zero_past_kv(num_layers=4, num_heads=6, head_dim=64,
                      decoder_seq_len=0, encoder_seq_len=1500, dtype="float32"):
    """Return zero-filled past key/value tensors for every decoder layer.

    Per layer the order is: self.key, self.value, cross.key, cross.value,
    giving num_layers * 4 tensors in total. Defaults presumably match the
    exported Whisper model (4 layers, 6 heads, head_dim 64) — confirm.
    """
    self_shape = (1, num_heads, decoder_seq_len, head_dim)
    cross_shape = (1, num_heads, encoder_seq_len, head_dim)
    layer_shapes = [self_shape, self_shape, cross_shape, cross_shape]
    return [
        tvm.nd.array(np.zeros(shape, dtype=dtype))
        for _ in range(num_layers)
        for shape in layer_shapes
    ]

def insert_profile_report(csv_str, agg=None):
    """Fold one VM profile report (CSV text) into an aggregation dict.

    Parameters
    ----------
    csv_str : str
        CSV report as produced by ``VirtualMachine.profile(...).csv()``,
        containing at least the columns ``Name``, ``Duration (us)`` and
        ``Count``.
    agg : mapping, optional
        Aggregator mapping name -> [total_duration_us, total_count],
        e.g. a ``defaultdict(lambda: [0, 0])``. Defaults to the module-level
        ``profile_agg`` so existing single-argument callers are unchanged.
    """
    if agg is None:
        agg = profile_agg
    reader = csv.DictReader(csv_str.strip().splitlines())
    for row in reader:
        name = row["Name"]
        duration = float(row["Duration (us)"])
        count = int(row["Count"])
        agg[name][0] += duration
        agg[name][1] += count

# === Load the Whisper processor and tokenizer (from the current directory) ===
processor = WhisperProcessor.from_pretrained("./")
tokenizer = WhisperTokenizer.from_pretrained("./")

# === Audio -> mel spectrogram ===

# === 1. Load audio ===
waveform, sr = sf.read("audio.wav")

# === 2. Resample to 16kHz if needed ===
target_sr = 16000
if sr != target_sr:
    num_samples = int(len(waveform) * target_sr / sr)
    waveform = signal.resample(waveform, num_samples)
    sr = target_sr

# === 3. Convert to mono ===
# NOTE: downmix happens after resampling; signal.resample operates on axis 0
# (the time axis of sf.read output), so stereo input is still handled.
if waveform.ndim > 1:
    waveform = waveform.mean(axis=1)

# === 4. Pass into Hugging Face processor (same as torchaudio flow) ===
inputs = processor(waveform, sampling_rate=16000, return_tensors="np")

# === 5. Get float32 mel features ===
mel = inputs.input_features.astype("float32")

print("Mel shape:", mel.shape)

# === Encoder ===
# profile=True only enables vm.profile(); it is unused unless the block
# below is un-commented.
encoder_vm = VirtualMachine(runtime.load_module("./onnx/encoder_model.so"), tvm.cpu(), profile=True)

# === Profile code block ===

# Profile the encoder execution
# profile_report = encoder_vm.profile("main", tvm.nd.array(mel)).csv()
# insert_profile_report(profile_report)
# Convert to CSV and save to file
# with open("./profile_data/encoder.csv", "w") as f:
#     f.write(profile_report)

# === End of Profile code block ===

start_time = datetime.now()
print("Start of encoder:", start_time)
encoder_out = encoder_vm["main"](tvm.nd.array(mel))  # shape: (1, 1500, 384)
end_time = datetime.now()
print("End of encoder:", end_time)
print("Encoder takes: ", (end_time-start_time).total_seconds())

# === Decoder Step 0: Prefill ===
# Initialize decoder VM with profiling enabled

# 50258 is presumably Whisper's <|startoftranscript|> id — confirm with tokenizer.
start_token = 50258
eos_token = tokenizer.eos_token_id
tokens = [start_token]
input_ids = np.array([[start_token]], dtype="int64")
# NOTE(review): past_kvs is never used below — the prefill decoder is called
# with only (input_ids, encoder_out). Dead code unless a KV-input prefill
# model is swapped in.
past_kvs = init_zero_past_kv()
inputs = [tvm.nd.array(input_ids), encoder_out]

decoder_prefill_vm = VirtualMachine(
    runtime.load_module("./onnx/decoder_model.so"), 
    tvm.cpu(), 
    profile=True
)

# Initialize empty KV (self + cross) for prefill decoder

# === Decoder profiling ===

# print("\\n=== Step 0 (Prefill) - Profiling ===")
# profile_report = decoder_prefill_vm.profile("main", *inputs).csv()
# insert_profile_report(profile_report)
# # Save CSV-formatted profiling report
# with open("./profile_data/decoder_prefill.csv", "w") as f:
#     f.write(profile_report)
# === End of Decoder profiling ===

# Get the actual output (without profiling)
start_time = datetime.now()
print("Start of decoder prefill:", start_time)
out = decoder_prefill_vm["main"](*inputs)
end_time = datetime.now()
print("End of decoder prefill:", end_time)
print("Decoder prefill takes: ", (end_time-start_time).total_seconds())

# out[0] = logits; greedy pick of the last position's argmax.
logits = out[0].numpy()
next_token = int(np.argmax(logits[0, -1]))

# NOTE(review): this assert runs AFTER logits[0, -1] was already indexed, so
# it cannot catch a bad rank in time.
assert logits.ndim == 2 or logits.ndim == 3, "logits 維度不符"

tokens.append(next_token)
# print(f"⬆️ Next token: {next_token} ({tokenizer.decode([next_token])})")

# Extract the 16 KV tensors the decoder returned.
decoder_kvs = list(out[1:])  # out[1]~out[16]

# Early exit if the very first generated token is already <eos>.
if next_token == eos_token:
    print("🛑 遇到 <eos>,結束解碼")
    transcript = tokenizer.decode(tokens, skip_special_tokens=True)
    print("\\n📝 Transcription:\\n", transcript)
    exit()

# === Decoder Step 1~N: step-by-step decoding with the KV cache ===

# === Decoder profiling ===
decoder_vm = VirtualMachine(
    runtime.load_module("./onnx/decoder_with_past_model.so"), 
    tvm.cpu(),
    profile=True  # Enable profiling
)
# === Decoder profiling ===

max_length = 64
all_reports = []  # Store all profiling reports — NOTE(review): never appended to

start_time = datetime.now()
print(f"Start of decoder token generation: {start_time}")

for step in range(1, max_length):
    # print(f"\\n=== Step {step} ===")
    # Feed only the most recent token; history lives in decoder_kvs.
    input_ids = np.array([[tokens[-1]]], dtype="int64")
    inputs = [tvm.nd.array(input_ids)] + decoder_kvs

    # Profile every step (optional: skip warm-up steps)
    # === Decoder profiling ===

    # profile_report = decoder_vm.profile("main", *inputs).csv()
    # insert_profile_report(profile_report)

    # Save with step number in filename

    # with open(f"./profile_data/decoder_step_{step}.csv", "w") as f:
    #     f.write(profile_report)
    # === Decoder profiling ===

    
    # Normal execution

    out = decoder_vm["main"](*inputs)
    end_time = datetime.now()

    # Greedy decoding: argmax over the last position's logits.
    logits = out[0].numpy()
    next_token = int(np.argmax(logits[0, -1]))
    tokens.append(next_token)
    # print(f"⬆️ Next token: {next_token} ({tokenizer.decode([next_token])})")

    if next_token == eos_token:
        print("🛑 遇到 <eos>,結束解碼")
        break

    # Update self-attention positions (index 0,1,4,5,8,9,12,13)
    # Cross-attention entries (2,3,6,7,10,11,14,15) stay fixed from prefill.
    for i, dst_idx in enumerate([0,1,4,5,8,9,12,13]):
        decoder_kvs[dst_idx] = out[i + 1]

print(f"End of decoder token generation: {end_time}")
print(f"Decoder token generation takes: {(end_time-start_time).total_seconds()}")

# === Final output ===
transcript = tokenizer.decode(tokens, skip_special_tokens=True)
print("\\n📝 Transcription:\\n", transcript)

# NOTE(review): leftover bare string below (looks like a docstring from a
# removed helper); kept verbatim since it is a no-op.
"""Write aggregated results to a CSV file."""
# === profiling data aggregation === 
# with open("./profile_data/aggregation.csv", "w") as f:
#     f.write("Name,Total Duration (us),Total Count\\n")
#     for name, (duration, count) in sorted(profile_agg.items(), key=lambda x: -x[1][0]):
#         f.write(f"{name},{duration},{count}\\n")
# === profiling data aggregation === 
end_time_all = datetime.now()
print("End of all:", end_time_all)
print("All takes: ", (end_time_all-start_time_all).total_seconds())

inference_profile.py

This is the same as inference.py above, but with all of the profiling code un-commented.

import numpy as np
import tvm
from tvm import relax, runtime
from tvm.relax import VirtualMachine
from transformers import WhisperProcessor, WhisperTokenizer

from datetime import datetime
import csv
from collections import defaultdict
import numpy as np
import soundfile as sf
from scipy import signal

# Global aggregation dict: {Name: [total_duration_us, total_count]}
profile_agg = defaultdict(lambda: [0, 0])
# Wall-clock timing of the whole script (encoder + decoder + decode loop).
start_time_all = datetime.now()
print("Start of all:", start_time_all)

# === Initialize the 16 empty past-KV tensors (shared by prefill and step-by-step) ===
def init_zero_past_kv(num_layers=4, num_heads=6, head_dim=64,
                      decoder_seq_len=0, encoder_seq_len=1500, dtype="float32"):
    """Return zero-filled past key/value tensors, 4 per decoder layer.

    Order per layer: self.key, self.value, cross.key, cross.value.
    Defaults presumably match the exported Whisper model (4 layers, 6 heads,
    head_dim 64, encoder length 1500) — confirm against the ONNX export.
    """
    shape_decoder = (1, num_heads, decoder_seq_len, head_dim)
    shape_encoder = (1, num_heads, encoder_seq_len, head_dim)
    kvs = []
    for _ in range(num_layers):
        kvs += [
            tvm.nd.array(np.zeros(shape_decoder, dtype=dtype)),  # self.key
            tvm.nd.array(np.zeros(shape_decoder, dtype=dtype)),  # self.value
            tvm.nd.array(np.zeros(shape_encoder, dtype=dtype)),  # cross.key
            tvm.nd.array(np.zeros(shape_encoder, dtype=dtype))   # cross.value
        ]
    return kvs

def insert_profile_report(csv_str):
    """Insert one profile report (CSV string format) into the global aggregator.

    `csv_str` is the output of ``VirtualMachine.profile(...).csv()`` and must
    contain the columns ``Name``, ``Duration (us)`` and ``Count``. Totals are
    accumulated into the module-level ``profile_agg``.
    """
    reader = csv.DictReader(csv_str.strip().splitlines())
    for row in reader:
        name = row["Name"]
        duration = float(row["Duration (us)"])
        count = int(row["Count"])
        profile_agg[name][0] += duration
        profile_agg[name][1] += count

# === Load the Whisper processor and tokenizer (from the current directory) ===
processor = WhisperProcessor.from_pretrained("./")
tokenizer = WhisperTokenizer.from_pretrained("./")

# === Audio -> mel spectrogram ===

# === 1. Load audio ===
waveform, sr = sf.read("audio.wav")

# === 2. Resample to 16kHz if needed ===
target_sr = 16000
if sr != target_sr:
    num_samples = int(len(waveform) * target_sr / sr)
    waveform = signal.resample(waveform, num_samples)
    sr = target_sr

# === 3. Convert to mono ===
# NOTE: downmix happens after resampling; signal.resample operates on axis 0
# (the time axis of sf.read output), so stereo input is still handled.
if waveform.ndim > 1:
    waveform = waveform.mean(axis=1)

# === 4. Pass into Hugging Face processor (same as torchaudio flow) ===
inputs = processor(waveform, sampling_rate=16000, return_tensors="np")

# === 5. Get float32 mel features ===
mel = inputs.input_features.astype("float32")

print("Mel shape:", mel.shape)

# === Encoder ===
# profile=True only enables vm.profile(); it is unused unless the block
# below is un-commented.
encoder_vm = VirtualMachine(runtime.load_module("./onnx/encoder_model.so"), tvm.cpu(), profile=True)

# === Profile code block ===

# Profile the encoder execution
# profile_report = encoder_vm.profile("main", tvm.nd.array(mel)).csv()
# insert_profile_report(profile_report)
# Convert to CSV and save to file
# with open("./profile_data/encoder.csv", "w") as f:
#     f.write(profile_report)

# === End of Profile code block ===

start_time = datetime.now()
print("Start of encoder:", start_time)
encoder_out = encoder_vm["main"](tvm.nd.array(mel))  # shape: (1, 1500, 384)
end_time = datetime.now()
print("End of encoder:", end_time)
print("Encoder takes: ", (end_time-start_time).total_seconds())

# === Decoder Step 0: Prefill ===
# Initialize decoder VM with profiling enabled

# 50258 is presumably Whisper's <|startoftranscript|> id — confirm with tokenizer.
start_token = 50258
eos_token = tokenizer.eos_token_id
tokens = [start_token]
input_ids = np.array([[start_token]], dtype="int64")
# NOTE(review): past_kvs is never used below — the prefill decoder is called
# with only (input_ids, encoder_out). Dead code unless a KV-input prefill
# model is swapped in.
past_kvs = init_zero_past_kv()
inputs = [tvm.nd.array(input_ids), encoder_out]

decoder_prefill_vm = VirtualMachine(
    runtime.load_module("./onnx/decoder_model.so"), 
    tvm.cpu(), 
    profile=True
)

# Initialize empty KV (self + cross) for prefill decoder

# === Decoder profiling ===

# print("\\n=== Step 0 (Prefill) - Profiling ===")
# profile_report = decoder_prefill_vm.profile("main", *inputs).csv()
# insert_profile_report(profile_report)
# # Save CSV-formatted profiling report
# with open("./profile_data/decoder_prefill.csv", "w") as f:
#     f.write(profile_report)
# === End of Decoder profiling ===

# Get the actual output (without profiling)
start_time = datetime.now()
print("Start of decoder prefill:", start_time)
out = decoder_prefill_vm["main"](*inputs)
end_time = datetime.now()
print("End of decoder prefill:", end_time)
print("Decoder prefill takes: ", (end_time-start_time).total_seconds())

# out[0] = logits; greedy pick of the last position's argmax.
logits = out[0].numpy()
next_token = int(np.argmax(logits[0, -1]))

# NOTE(review): this assert runs AFTER logits[0, -1] was already indexed, so
# it cannot catch a bad rank in time.
assert logits.ndim == 2 or logits.ndim == 3, "logits 維度不符"

tokens.append(next_token)
# print(f"⬆️ Next token: {next_token} ({tokenizer.decode([next_token])})")

# Extract the 16 KV tensors the decoder returned.
decoder_kvs = list(out[1:])  # out[1]~out[16]

# Early exit if the very first generated token is already <eos>.
if next_token == eos_token:
    print("🛑 遇到 <eos>,結束解碼")
    transcript = tokenizer.decode(tokens, skip_special_tokens=True)
    print("\\n📝 Transcription:\\n", transcript)
    exit()

# === Decoder Step 1~N: step-by-step decoding with the KV cache ===

# === Decoder profiling ===
decoder_vm = VirtualMachine(
    runtime.load_module("./onnx/decoder_with_past_model.so"), 
    tvm.cpu(),
    profile=True  # Enable profiling
)
# === Decoder profiling ===

max_length = 64
all_reports = []  # Store all profiling reports — NOTE(review): never appended to

start_time = datetime.now()
print(f"Start of decoder token generation: {start_time}")

for step in range(1, max_length):
    # print(f"\\n=== Step {step} ===")
    # Feed only the most recent token; history lives in decoder_kvs.
    input_ids = np.array([[tokens[-1]]], dtype="int64")
    inputs = [tvm.nd.array(input_ids)] + decoder_kvs

    # Profile every step (optional: skip warm-up steps)
    # === Decoder profiling ===

    # profile_report = decoder_vm.profile("main", *inputs).csv()
    # insert_profile_report(profile_report)

    # Save with step number in filename

    # with open(f"./profile_data/decoder_step_{step}.csv", "w") as f:
    #     f.write(profile_report)
    # === Decoder profiling ===

    
    # Normal execution

    out = decoder_vm["main"](*inputs)
    end_time = datetime.now()

    # Greedy decoding: argmax over the last position's logits.
    logits = out[0].numpy()
    next_token = int(np.argmax(logits[0, -1]))
    tokens.append(next_token)
    # print(f"⬆️ Next token: {next_token} ({tokenizer.decode([next_token])})")

    if next_token == eos_token:
        print("🛑 遇到 <eos>,結束解碼")
        break

    # Update self-attention positions (index 0,1,4,5,8,9,12,13)
    # Cross-attention entries (2,3,6,7,10,11,14,15) stay fixed from prefill.
    for i, dst_idx in enumerate([0,1,4,5,8,9,12,13]):
        decoder_kvs[dst_idx] = out[i + 1]

print(f"End of decoder token generation: {end_time}")
print(f"Decoder token generation takes: {(end_time-start_time).total_seconds()}")

# === Final output ===
transcript = tokenizer.decode(tokens, skip_special_tokens=True)
print("\\n📝 Transcription:\\n", transcript)

# NOTE(review): leftover bare string below (looks like a docstring from a
# removed helper); kept verbatim since it is a no-op.
"""Write aggregated results to a CSV file."""
# === profiling data aggregation === 
# with open("./profile_data/aggregation.csv", "w") as f:
#     f.write("Name,Total Duration (us),Total Count\\n")
#     for name, (duration, count) in sorted(profile_agg.items(), key=lambda x: -x[1][0]):
#         f.write(f"{name},{duration},{count}\\n")
# === profiling data aggregation === 
end_time_all = datetime.now()
print("End of all:", end_time_all)
print("All takes: ", (end_time_all-start_time_all).total_seconds())

bananapi.cmake

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   <http://www.apache.org/licenses/LICENSE-2.0>
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# bananapi Codegen only. This can be enabled independently of USE_BANANAPI_RUNTIME to enable
# compilation of bananapi modules without requiring bananapi to be installed. The compiled modules
# will only be able to be executed using a TVM built with USE_BANANAPI_RUNTIME=ON.

include (FindPackageHandleStandardArgs)

# Codegen half: compiles the bananapi Relax codegen sources into the TVM
# compiler. Can be enabled without the runtime (see note above).
if(USE_BANANAPI_CODEGEN)
    message(STATUS "Build with bananapi codegen")
    tvm_file_glob(GLOB COMPILER_BANANAPI_SRCS src/relax/backend/contrib/bananapi/*.cc)
    tvm_file_glob(GLOB RUNTIME_BANANAPI_SRCS src/runtime/contrib/bananapi/*.cc)
    list(APPEND COMPILER_SRCS ${COMPILER_BANANAPI_SRCS})
    # When the runtime is off, fold the runtime sources into the compiler
    # build so codegen-only builds still link the bananapi symbols.
    if(NOT USE_BANANAPI_RUNTIME)
	    list(APPEND COMPILER_SRCS ${RUNTIME_BANANAPI_SRCS})
    endif()
endif()

# bananapi Runtime
if(USE_BANANAPI_RUNTIME)
    message(STATUS "Build with bananapi runtime")

    # Left over from the TensorRT template this file was adapted from:
    #find_path(BANANAPI_INCLUDE_DIR NvInfer.h HINTS ${BANANAPI_ROOT_DIR} PATH_SUFFIXES include)
    #find_library(BANANAPI_LIB_DIR nvinfer HINTS ${BANANAPI_ROOT_DIR} PATH_SUFFIXES lib)
    #include_directories(${BANANAPI_INCLUDE_DIR})
    #list(APPEND TVM_RUNTIME_LINKER_LIBS ${BANANAPI_LIB_DIR})

    # TRT runtime sources
    tvm_file_glob(GLOB RUNTIME_BANANAPI_SRCS src/runtime/contrib/bananapi/*.cc)
    # The glob only matches *.cc, so the .cpp implementation must be listed
    # explicitly.
    list(APPEND RUNTIME_BANANAPI_SRCS src/runtime/contrib/bananapi/libmatmul.cpp)     # Add libmatmul.cpp explicitly

    list(APPEND RUNTIME_SRCS ${RUNTIME_BANANAPI_SRCS})

    # Set defines
    add_definitions(-DTVM_GRAPH_EXECUTOR_BANANAPI)
endif()

libmatmul.cpp

For the native RISC-V compiler (building directly on the board):

cd /home/fre930727/tvm/src/runtime/contrib/bananapi

g++ -std=c++11 -shared -fPIC \\
    -march=rv64gcv -mabi=lp64d \\
    -I ~/tvm/3rdparty/dlpack/include \\
    -o libmatmul.so libmatmul.cpp

For x86 compiler

cd /home/fre930727/tvm/src/runtime/contrib/bananapi

g++ -std=c++11 -shared -fPIC \\
    -I ~/tvm/3rdparty/dlpack/include \\
    -o libmatmul.so libmatmul.cpp

For cross-compiling to RISC-V from an x86 host (untested — may not work):

cd /home/fre930727/tvm/src/runtime/contrib/bananapi

riscv64-unknown-linux-gnu-g++ -std=c++11 -shared -fPIC \\
    -march=rv64gcv -mabi=lp64d \\
    -I ~/tvm/3rdparty/dlpack/include \\
    -o libmatmul.so libmatmul.cpp