From 66d6994dceb644aad95d26bc79945e9c2a18fc92 Mon Sep 17 00:00:00 2001 From: qinjun-li Date: Thu, 5 Sep 2024 14:21:28 +0800 Subject: [PATCH] [rtl] refactor mask unit. --- t1/src/Bundles.scala | 76 ++ t1/src/Lane.scala | 89 ++- t1/src/T1.scala | 961 +++--------------------- t1/src/decoder/Decoder.scala | 18 +- t1/src/decoder/attribute/isSwrite.scala | 2 - t1/src/laneStage/LaneStage3.scala | 54 +- t1/src/laneStage/MaskExchangeUnit.scala | 65 ++ t1/src/laneStage/SlotTokenManager.scala | 27 +- t1/src/mask/MaskCompress.scala | 123 +++ t1/src/mask/MaskExtend.scala | 69 ++ t1/src/mask/MaskReduce.scala | 186 +++++ t1/src/mask/MaskUnit.scala | 580 ++++++++++++++ t1/src/mask/MaskUnitReadCrossBar.scala | 48 ++ t1/src/package.scala | 26 + t1/src/sequencer/T1TokenManager.scala | 49 +- t1/src/vrf/VRF.scala | 11 +- 16 files changed, 1395 insertions(+), 989 deletions(-) create mode 100644 t1/src/laneStage/MaskExchangeUnit.scala create mode 100644 t1/src/mask/MaskCompress.scala create mode 100644 t1/src/mask/MaskExtend.scala create mode 100644 t1/src/mask/MaskReduce.scala create mode 100644 t1/src/mask/MaskUnit.scala create mode 100644 t1/src/mask/MaskUnitReadCrossBar.scala diff --git a/t1/src/Bundles.scala b/t1/src/Bundles.scala index fd833f07e..42652cb51 100644 --- a/t1/src/Bundles.scala +++ b/t1/src/Bundles.scala @@ -698,3 +698,79 @@ class T1Retire(xLen: Int) extends Bundle { val csr: ValidIO[T1CSRRetire] = Valid(new T1CSRRetire) val mem: ValidIO[EmptyBundle] = Valid(new EmptyBundle) } + +class MaskUnitExecuteState(parameter: T1Parameter) extends Bundle { + val groupReadState: UInt = UInt(parameter.laneNumber.W) + val needRead: UInt = UInt(parameter.laneNumber.W) + val elementValid: UInt = UInt(parameter.laneNumber.W) + val readOffset: UInt = UInt((parameter.laneNumber * parameter.laneParam.vrfOffsetBits).W) + val accessLane: Vec[UInt] = Vec(parameter.laneNumber, UInt(log2Ceil(parameter.laneNumber).W)) + // 3: log2Ceil(8); 8: Use up to 8 registers + val vsGrowth: Vec[UInt] = Vec(parameter.laneNumber, UInt(3.W)) + val groupCount: UInt = UInt(parameter.laneParam.groupNumberBits.W) + val executeIndex: UInt = UInt(2.W) + val readDataOffset: UInt = UInt((log2Ceil(parameter.datapathWidth / 8) * parameter.laneNumber).W) + val last: Bool = Bool() +} + +class MaskUnitInstReq(parameter: T1Parameter) extends Bundle { + val instructionIndex: UInt = UInt(parameter.instructionIndexBits.W) + val decodeResult: DecodeBundle = Decoder.bundle(parameter.decoderParam) + val readFromScala: UInt = UInt(parameter.datapathWidth.W) + val sew: UInt = UInt(2.W) + val vlmul: UInt = UInt(3.W) + val maskType: Bool = Bool() + val vxrm: UInt = UInt(3.W) + val vs2: UInt = UInt(5.W) + val vl: UInt = UInt(parameter.laneParam.vlMaxBits.W) +} + +class MaskUnitExeReq(parameter: LaneParameter) extends Bundle { + // source1, read vs + val source1: UInt = UInt(parameter.datapathWidth.W) + // source2, read offset + val source2: UInt = UInt(parameter.datapathWidth.W) + val groupCounter: UInt = UInt(parameter.groupNumberBits.W) + val index: UInt = UInt(parameter.instructionIndexBits.W) +} + +class MaskUnitExeResponse(parameter: LaneParameter) extends Bundle { + val ffoByOther: Bool = Bool() + val writeData = new MaskUnitWriteBundle(parameter) + val index: UInt = UInt(parameter.instructionIndexBits.W) +} + +class MaskUnitReadReq(parameter: T1Parameter) extends Bundle { + val vs: UInt = UInt(5.W) + // source2, read offset + val offset: UInt = UInt(parameter.laneParam.vrfOffsetBits.W) + // Read which lane + val readLane: UInt = 
UInt(log2Ceil(parameter.laneNumber).W) + // from which request + val requestIndex: UInt = UInt(log2Ceil(parameter.laneNumber).W) + // data position in data path + val dataOffset: UInt = UInt(log2Ceil(parameter.datapathWidth / 8).W) +} + +class MaskUnitReadQueue(parameter: T1Parameter) extends Bundle { + val vs: UInt = UInt(5.W) + // source2, read offset + val offset: UInt = UInt(parameter.laneParam.vrfOffsetBits.W) + // Which channel will this read request be written to? + val writeIndex: UInt = UInt(log2Ceil(parameter.laneNumber).W) + val dataOffset: UInt = UInt(log2Ceil(parameter.datapathWidth / 8).W) +} + +class MaskUnitWaitReadQueue(parameter: T1Parameter) extends Bundle { + val groupCounter: UInt = UInt(parameter.laneParam.groupNumberBits.W) + val executeIndex: UInt = UInt(2.W) + val sourceValid: UInt = UInt(parameter.laneNumber.W) + val needRead: UInt = UInt(parameter.laneNumber.W) + val last: Bool = Bool() +} + +class MaskUnitWriteBundle(parameter: LaneParameter) extends Bundle { + val data: UInt = UInt(parameter.datapathWidth.W) + val mask: UInt = UInt((parameter.datapathWidth / 8).W) + val groupCounter: UInt = UInt(parameter.groupNumberBits.W) +} diff --git a/t1/src/Lane.scala b/t1/src/Lane.scala index bb5f829e0..7a22b5cbf 100644 --- a/t1/src/Lane.scala +++ b/t1/src/Lane.scala @@ -234,13 +234,14 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ @public val csrInterface: CSRInterface = IO(Input(new CSRInterface(parameter.vlMaxBits))) - /** response to [[T1.lsu]] or mask unit in [[T1]] */ @public - val laneResponse: ValidIO[LaneResponse] = IO(Valid(new LaneResponse(parameter))) + val maskUnitRequest: DecoupledIO[MaskUnitExeReq] = IO(Decoupled(new MaskUnitExeReq(parameter))) - /** feedback from [[T1]] to [[Lane]] for [[laneResponse]] */ @public - val laneResponseFeedback: ValidIO[LaneResponseFeedback] = IO(Flipped(Valid(new LaneResponseFeedback(parameter)))) + val maskRequestToLSU: Bool = IO(Output(Bool())) + + @public + val maskUnitResponse: ValidIO[MaskUnitExeResponse] = IO(Flipped(Valid(new MaskUnitExeResponse(parameter)))) /** for LSU and V accessing lane, this is not a part of ring, but a direct connection. 
*/ @public @@ -570,14 +571,25 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ slotCanShift(index) := true.B } - val laneState: LaneState = Wire(new LaneState(parameter)) - val stage0: Instance[LaneStage0] = Instantiate(new LaneStage0(parameter, isLastSlot)) - val stage1: Instance[LaneStage1] = Instantiate(new LaneStage1(parameter, isLastSlot)) - val stage2: Instance[LaneStage2] = Instantiate(new LaneStage2(parameter, isLastSlot)) - val executionUnit: Instance[LaneExecutionBridge] = Instantiate( + val laneState: LaneState = Wire(new LaneState(parameter)) + val stage0: Instance[LaneStage0] = Instantiate(new LaneStage0(parameter, isLastSlot)) + val stage1: Instance[LaneStage1] = Instantiate(new LaneStage1(parameter, isLastSlot)) + val stage2: Instance[LaneStage2] = Instantiate(new LaneStage2(parameter, isLastSlot)) + val executionUnit: Instance[LaneExecutionBridge] = Instantiate( new LaneExecutionBridge(parameter, isLastSlot, index) ) - val stage3: Instance[LaneStage3] = Instantiate(new LaneStage3(parameter, isLastSlot)) + val maskStage: Option[Instance[MaskExchangeUnit]] = + Option.when(isLastSlot)(Instantiate(new MaskExchangeUnit(parameter))) + val stage3: Instance[LaneStage3] = Instantiate(new LaneStage3(parameter, isLastSlot)) + val stage3EnqWire: DecoupledIO[LaneStage3Enqueue] = Wire(Decoupled(new LaneStage3Enqueue(parameter, isLastSlot))) + val stage3EnqSelect: DecoupledIO[LaneStage3Enqueue] = maskStage.map { mask => + mask.enqueue <> stage3EnqWire + maskUnitRequest <> mask.maskReq + maskRequestToLSU <> mask.maskRequestToLSU + mask.maskUnitResponse := maskUnitResponse + mask.dequeue + }.getOrElse(stage3EnqWire) + stage3.enqueue <> stage3EnqSelect // slot state laneState.vSew1H := vSew1H @@ -759,50 +771,47 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ 0.U(parameter.chainingSize.W) ) AssertProperty(BoolSequence(!executionUnit.dequeue.valid || stage2.dequeue.valid)) - stage3.enqueue.valid := executionUnit.dequeue.valid - executionUnit.dequeue.ready := stage3.enqueue.ready + stage3EnqWire.valid := executionUnit.dequeue.valid + executionUnit.dequeue.ready := stage3EnqWire.ready stage2.dequeue.ready := executionUnit.dequeue.fire if (!isLastSlot) { - stage3.enqueue.bits := DontCare + stage3EnqWire.bits := DontCare } // pipe state from stage0 - stage3.enqueue.bits.decodeResult := stage2.dequeue.bits.decodeResult - stage3.enqueue.bits.instructionIndex := stage2.dequeue.bits.instructionIndex - stage3.enqueue.bits.loadStore := stage2.dequeue.bits.loadStore - stage3.enqueue.bits.vd := stage2.dequeue.bits.vd - stage3.enqueue.bits.ffoByOtherLanes := ffoRecord.ffoByOtherLanes - stage3.enqueue.bits.groupCounter := stage2.dequeue.bits.groupCounter - stage3.enqueue.bits.mask := stage2.dequeue.bits.mask + stage3EnqWire.bits.decodeResult := stage2.dequeue.bits.decodeResult + stage3EnqWire.bits.instructionIndex := stage2.dequeue.bits.instructionIndex + stage3EnqWire.bits.loadStore := stage2.dequeue.bits.loadStore + stage3EnqWire.bits.vd := stage2.dequeue.bits.vd + stage3EnqWire.bits.ffoByOtherLanes := ffoRecord.ffoByOtherLanes + stage3EnqWire.bits.groupCounter := stage2.dequeue.bits.groupCounter + stage3EnqWire.bits.mask := stage2.dequeue.bits.mask if (isLastSlot) { - stage3.enqueue.bits.sSendResponse := stage2.dequeue.bits.sSendResponse.get - stage3.enqueue.bits.ffoSuccess := executionUnit.dequeue.bits.ffoSuccess.get - stage3.enqueue.bits.fpReduceValid.zip(executionUnit.dequeue.bits.fpReduceValid).foreach { case (sink, source) => + 
stage3EnqWire.bits.sSendResponse := stage2.dequeue.bits.sSendResponse.get + stage3EnqWire.bits.ffoSuccess := executionUnit.dequeue.bits.ffoSuccess.get + stage3EnqWire.bits.fpReduceValid.zip(executionUnit.dequeue.bits.fpReduceValid).foreach { case (sink, source) => sink := source } } - stage3.enqueue.bits.data := executionUnit.dequeue.bits.data - stage3.enqueue.bits.pipeData := stage2.dequeue.bits.pipeData.getOrElse(DontCare) - stage3.enqueue.bits.ffoIndex := executionUnit.dequeue.bits.ffoIndex - executionUnit.dequeue.bits.crossWriteData.foreach(data => stage3.enqueue.bits.crossWriteData := data) - stage2.dequeue.bits.sSendResponse.foreach(_ => stage3.enqueue.bits.sSendResponse := _) - executionUnit.dequeue.bits.ffoSuccess.foreach(_ => stage3.enqueue.bits.ffoSuccess := _) + stage3EnqWire.bits.data := executionUnit.dequeue.bits.data + stage3EnqWire.bits.pipeData := stage2.dequeue.bits.pipeData.getOrElse(DontCare) + stage3EnqWire.bits.ffoIndex := executionUnit.dequeue.bits.ffoIndex + executionUnit.dequeue.bits.crossWriteData.foreach(data => stage3EnqWire.bits.crossWriteData := data) + stage2.dequeue.bits.sSendResponse.foreach(_ => stage3EnqWire.bits.sSendResponse := _) + executionUnit.dequeue.bits.ffoSuccess.foreach(_ => stage3EnqWire.bits.ffoSuccess := _) if (isLastSlot) { - when(laneResponseFeedback.valid) { - when(laneResponseFeedback.bits.complete) { + when(maskUnitResponse.valid) { + when(maskUnitResponse.bits.ffoByOther) { ffoRecord.ffoByOtherLanes := true.B } } - when(stage3.enqueue.fire) { + when(stage3EnqWire.fire) { executionUnit.dequeue.bits.ffoSuccess.foreach(ffoRecord.selfCompleted := _) // This group found means the next group ended early ffoRecord.ffoByOtherLanes := ffoRecord.ffoByOtherLanes || ffoRecord.selfCompleted } - - laneResponse <> stage3.laneResponse.get - stage3.laneResponseFeedback.get <> laneResponseFeedback } // --- stage 3 end & stage 4 start --- @@ -1176,10 +1185,10 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ rpt.bits := allVrfWriteAfterCheck(parameter.chainingSize + 1 + rptIndex).instructionIndex } // todo: add mask unit write token - tokenManager.responseReport.valid := laneResponse.valid - tokenManager.responseReport.bits := laneResponse.bits.instructionIndex - tokenManager.responseFeedbackReport.valid := laneResponseFeedback.valid - tokenManager.responseFeedbackReport.bits := laneResponseFeedback.bits.instructionIndex + tokenManager.responseReport.valid := maskUnitRequest.valid + tokenManager.responseReport.bits := maskUnitRequest.bits.index + tokenManager.responseFeedbackReport.valid := maskUnitResponse.valid + tokenManager.responseFeedbackReport.bits := maskUnitResponse.bits.index val instInSlot: UInt = slotControl .zip(slotOccupied) .map { case (slotState, occupied) => @@ -1212,6 +1221,8 @@ class Lane(val parameter: LaneParameter) extends Module with SerializableModule[ tokenManager.topWriteDeq.valid := afterCheckDequeueFire(parameter.chainingSize) tokenManager.topWriteDeq.bits := allVrfWriteAfterCheck(parameter.chainingSize).instructionIndex + tokenManager.maskUnitLastReport := lsuLastReport + layer.block(layers.Verification) { val probeWire = Wire(new LaneProbe(parameter)) define(laneProbe, ProbeValue(probeWire)) diff --git a/t1/src/T1.scala b/t1/src/T1.scala index b87b3d043..bc118284b 100644 --- a/t1/src/T1.scala +++ b/t1/src/T1.scala @@ -392,8 +392,9 @@ class T1(val parameter: T1Parameter) /** the LSU Module */ - val lsu: Instance[LSU] = Instantiate(new LSU(parameter.lsuParameters)) - val decode: 
Instance[VectorDecoder] = Instantiate(new VectorDecoder(parameter.decoderParam)) + val lsu: Instance[LSU] = Instantiate(new LSU(parameter.lsuParameters)) + val decode: Instance[VectorDecoder] = Instantiate(new VectorDecoder(parameter.decoderParam)) + val maskUnit: Instance[MaskUnit] = Instantiate(new MaskUnit(parameter)) omInstance.decoderIn := Property(decode.om.asAnyClassType) val tokenManager: Instance[T1TokenManager] = Instantiate(new T1TokenManager(parameter)) @@ -519,17 +520,6 @@ class T1(val parameter: T1Parameter) /** which slot the instruction is entering */ val instructionToSlotOH: UInt = Wire(UInt(parameter.chainingSize.W)) - /** synchronize signal from each lane, for mask units.(ffo) */ - val laneSynchronize: Vec[Bool] = Wire(Vec(parameter.laneNumber, Bool())) - - /** all lanes are synchronized. */ - val synchronized: Bool = WireDefault(false.B) - - /** for mask unit that need to access VRF from lanes, use this signal to indicate it is finished access VRF(but - * instruction might not finish). - */ - val maskUnitReadOnlyFinish: Bool = WireDefault(false.B) - /** last slot is committing. */ val lastSlotCommit: Bool = Wire(Bool()) @@ -540,11 +530,6 @@ class T1(val parameter: T1Parameter) val vxsatReportVec: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.chainingSize.W))) val vxsatReport = vxsatReportVec.reduce(_ | _) - // todo: 把lsu也放decode里去 - val maskUnitType: Bool = decodeResult(Decoder.maskUnit) && requestRegDequeue.bits.instruction(6) - val maskDestination = decodeResult(Decoder.maskDestination) - val unOrderType: Bool = decodeResult(Decoder.unOrderWrite) - /** Special instructions which will be allocate to the last slot. * - mask unit * - Lane <-> Top has data exchange(top might forward to LSU.) TODO: move to normal slots(add `offset` fields) @@ -555,99 +540,44 @@ class T1(val parameter: T1Parameter) val dataInWritePipeVec: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.chainingSize.W))) val dataInWritePipe: UInt = dataInWritePipeVec.reduce(_ | _) - /** designed for unordered instruction(slide), it doesn't go to lane, it has RAW hazzard. 
- */ - val instructionRAWReady: Bool = Wire(Bool()) - val allSlotFree: Bool = Wire(Bool()) - val existMaskType: Bool = Wire(Bool()) - - // mask Unit 与lane交换数据 - val writeType: VRFWriteRequest = new VRFWriteRequest( - parameter.vrfParam.regNumBits, - parameter.vrfParam.vrfOffsetBits, - parameter.instructionIndexBits, - parameter.datapathWidth - ) - val maskUnitWrite: ValidIO[VRFWriteRequest] = Wire(Valid(writeType)) - val maskUnitWriteVec: Vec[ValidIO[VRFWriteRequest]] = Wire(Vec(3, Valid(writeType))) - val maskWriteLaneSelect: Vec[UInt] = Wire(Vec(3, UInt(parameter.laneNumber.W))) - // 默认是head - val maskUnitWriteSelect: UInt = Mux1H(maskUnitWriteVec.map(_.valid), maskWriteLaneSelect) - maskUnitWriteVec.foreach(_ := DontCare) - maskUnitWrite := Mux1H(maskUnitWriteVec.map(_.valid), maskUnitWriteVec) - val writeSelectMaskUnit: Vec[Bool] = Wire(Vec(parameter.laneNumber, Bool())) - val maskUnitWriteReady: Bool = writeSelectMaskUnit.asUInt.orR + // todo: instructionRAWReady -> v0 write token + val allSlotFree: Bool = Wire(Bool()) + val existMaskType: Bool = Wire(Bool()) // read - val readType: VRFReadRequest = new VRFReadRequest( + val readType: VRFReadRequest = new VRFReadRequest( parameter.vrfParam.regNumBits, parameter.vrfParam.vrfOffsetBits, parameter.instructionIndexBits ) - val maskUnitRead: ValidIO[VRFReadRequest] = Wire(Valid(readType)) - val maskUnitReadVec: Vec[ValidIO[VRFReadRequest]] = Wire(Vec(3, Valid(readType))) - val maskReadLaneSelect: Vec[UInt] = Wire(Vec(3, UInt(parameter.laneNumber.W))) - val maskUnitReadSelect: UInt = Mux1H(maskUnitReadVec.map(_.valid), maskReadLaneSelect) - maskUnitRead := Mux1H(maskUnitReadVec.map(_.valid), maskUnitReadVec) - val readSelectMaskUnit: Vec[Bool] = Wire(Vec(parameter.laneNumber, Bool())) - val maskUnitReadReady = readSelectMaskUnit.asUInt.orR - val laneReadResult: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.datapathWidth.W))) - val WARRedResult: ValidIO[UInt] = RegInit(0.U.asTypeOf(Valid(UInt(parameter.datapathWidth.W)))) - // mask unit 最后的写 - val maskUnitFlushVrf: Bool = WireDefault(false.B) + // todo: ix type gather read // gather read state - val gatherOverlap: Bool = Wire(Bool()) - val gatherNeedRead: Bool = requestRegDequeue.valid && decodeResult(Decoder.gather) && + val gatherOverlap: Bool = Wire(Bool()) + val gatherNeedRead: Bool = requestRegDequeue.valid && decodeResult(Decoder.gather) && !decodeResult(Decoder.vtype) && !gatherOverlap + val gatherData: UInt = RegInit(0.U(parameter.datapathWidth.W)) + val gatherReadRequest: DecoupledIO[VRFReadRequest] = Wire(Decoupled(readType)) + val gatherReadLaneSelect: UInt = Wire(UInt(parameter.laneNumber.W)) + val gatherReadResultFire = Pipe(gatherReadRequest.fire, gatherReadLaneSelect, parameter.vrfReadLatency).valid val gatherReadFinish: Bool = RegEnable( !requestRegDequeue.fire, false.B, - (RegNext(RegNext(maskUnitReadReady)) && gatherNeedRead) || requestRegDequeue.fire + (gatherReadResultFire && gatherNeedRead) || requestRegDequeue.fire ) val gatherReadDataOffset: UInt = Wire(UInt(5.W)) - val gatherData: UInt = Mux(gatherOverlap, 0.U, (WARRedResult.bits >> gatherReadDataOffset).asUInt) - /** data that need to be compute at top. 
*/ - val data: Vec[ValidIO[UInt]] = RegInit( - VecInit(Seq.fill(parameter.laneNumber)(0.U.asTypeOf(Valid(UInt(parameter.datapathWidth.W))))) - ) - val flotReduceValid: Seq[Option[Bool]] = Seq.tabulate(parameter.laneNumber) { _ => - Option.when(parameter.fpuEnable)(RegInit(false.B)) - } - val maskDataForCompress: UInt = RegInit(0.U(parameter.datapathWidth.W)) - // clear the previous set of data from lane - val dataClear: Bool = WireDefault(false.B) - val completedVec: Vec[Bool] = RegInit(VecInit(Seq.fill(parameter.laneNumber)(false.B))) - // ffoIndexReg.valid: Already found the first one - val ffoIndexReg: ValidIO[UInt] = RegInit(0.U.asTypeOf(Valid(UInt(parameter.xLen.W)))) - val ffoType: Bool = Wire(Bool()) + // todo + gatherReadRequest.valid := DontCare + gatherReadRequest.bits := DontCare + gatherReadRequest.ready := DontCare + gatherOverlap := DontCare + gatherReadLaneSelect := DontCare + gatherReadDataOffset := DontCare /** for find first one, need to tell the lane with higher index `1` . */ - val completedLeftOr: UInt = (scanLeftOr(completedVec.asUInt) << 1).asUInt(parameter.laneNumber - 1, 0) - // 按指定的sew拼成 {laneNumer * dataPathWidth} bit, 然后根据sew选择出来 - val sortedData: UInt = Mux1H( - vSewOHForMask, - Seq(4, 2, 1).map { groupSize => - VecInit(data.map { element => - element.bits.asBools // [x] * 32 eg: sew = 1 - .grouped(groupSize) // [x, x] * 16 - .toSeq - .map(VecInit(_).asUInt) // [xx] * 16 - }.transpose.map(VecInit(_).asUInt)).asUInt // [x*16] * 16 -> x * 256 - } - ) - // 把已经排过序的数据重新分给各个lane - val regroupData: Vec[UInt] = VecInit(Seq.tabulate(parameter.laneNumber) { laneIndex => - sortedData( - laneIndex * parameter.datapathWidth + parameter.datapathWidth - 1, - laneIndex * parameter.datapathWidth - ) - }) - val dataResult: ValidIO[UInt] = RegInit(0.U.asTypeOf(Valid(UInt(parameter.datapathWidth.W)))) - - val executeForLastLaneFire: Bool = WireDefault(false.B) + val dataResult: UInt = RegInit(0.U.asTypeOf(UInt(parameter.datapathWidth.W))) /** state machine register for each instruction. */ val slots: Seq[InstructionControl] = Seq.tabulate(parameter.chainingSize) { index => @@ -658,8 +588,6 @@ class T1(val parameter: T1Parameter) .asTypeOf(new InstructionControl(parameter.instructionIndexBits, parameter.laneNumber)) ) - val mvToVRF: Option[Bool] = Option.when(index == parameter.chainingSize - 1)(RegInit(false.B)) - /** the execution is finished. (but there might still exist some data in the ring.) 
*/ val laneAndLSUFinish: Bool = control.endTag.asUInt.andR @@ -717,735 +645,22 @@ class T1(val parameter: T1Parameter) control.vxsat := true.B } } - // logic like mask&reduce will be put to last slot - // TODO: review later if (index == (parameter.chainingSize - 1)) { - val feedBack: UInt = RegInit(0.U(parameter.laneNumber.W)) - val executeCounter: UInt = RegInit(0.U((log2Ceil(parameter.laneNumber) + 1).W)) - // mask destination时这两count都是以写vrf为视角 - val writeBackCounter: UInt = RegInit(0.U(log2Ceil(parameter.laneNumber).W)) - val groupCounter: UInt = RegInit(0.U(parameter.groupNumberMaxBits.W)) - val iotaCount: UInt = RegInit(0.U((parameter.laneParam.vlMaxBits - 1).W)) - val maskTypeInstruction = RegInit(false.B) - val vd = RegInit(0.U(5.W)) - val vs1 = RegInit(0.U(5.W)) - val vs2 = RegInit(0.U(5.W)) - val rs1 = RegInit(0.U(parameter.xLen.W)) - val vm = RegInit(false.B) - val executeFinishReg = RegInit(true.B) - val unOrderTypeInstruction = RegInit(false.B) - val decodeResultReg = RegInit(0.U.asTypeOf(decodeResult)) - val gather: Bool = decodeResultReg(Decoder.gather) - // for slid - val elementIndexCount = RegInit(0.U(parameter.laneParam.vlMaxBits.W)) - val compressWriteCount = RegInit(0.U(parameter.laneParam.vlMaxBits.W)) - val nextElementIndex: UInt = elementIndexCount + 1.U - val firstElement = elementIndexCount === 0.U - val lastElement: Bool = nextElementIndex === csrRegForMaskUnit.vl - val updateMaskIndex = WireDefault(false.B) - when(updateMaskIndex) { elementIndexCount := nextElementIndex } - // 特殊的指令,会阻止 wLast 后把 sExecute 拉回来, 因为需要等待读后才写 - val mixedUnit: Bool = Wire(Bool()) - // slid & gather & extend - val slidUnitIdle: Bool = RegInit(true.B) - // compress & iota - val iotaUnitIdle: Bool = RegInit(true.B) - val orderedReduceGroupCount: Option[UInt] = Option.when(parameter.fpuEnable)( - RegInit(0.U(log2Ceil(parameter.vLen / parameter.laneNumber).W)) - ) - val orderedReduceIdle: Option[Bool] = Option.when(parameter.fpuEnable)(RegInit(true.B)) - val maskUnitIdle = (Seq(slidUnitIdle, iotaUnitIdle) ++ orderedReduceIdle).reduce(_ && _) - val reduce = decodeResultReg(Decoder.red) - val orderedReduce: Bool = if (parameter.fpuEnable) decodeResultReg(Decoder.orderReduce) else false.B - val popCount = decodeResultReg(Decoder.popCount) - val extend = decodeResultReg(Decoder.extend) - // first type instruction - val firstLane = ffo(completedVec.asUInt) - val firstLaneIndex: UInt = OHToUInt(firstLane)(log2Ceil(parameter.laneNumber) - 1, 0) - io.retire.rd.valid := lastSlotCommit && decodeResultReg(Decoder.targetRd) + val writeRD = RegInit(false.B) + val float: Option[Bool] = Option.when(parameter.fpuEnable)(RegInit(false.B)) + val vd = RegInit(0.U(5.W)) + when(instructionToSlotOH(index)) { + writeRD := decodeResult(Decoder.targetRd) + float.foreach(_ := decodeResult(Decoder.float)) + vd := requestRegDequeue.bits.instruction(11, 7) + } + io.retire.rd.valid := lastSlotCommit && writeRD io.retire.rd.bits.rdAddress := vd if (parameter.fpuEnable) { - io.retire.rd.bits.isFp := decodeResultReg(Decoder.float) + io.retire.rd.bits.isFp := float.getOrElse(false.B) } else { io.retire.rd.bits.isFp := false.B } - when(requestRegDequeue.fire) { - ffoIndexReg.valid := false.B - ffoIndexReg.bits := -1.S(parameter.xLen.W).asUInt - }.elsewhen(synchronized && completedVec.asUInt.orR && !ffoIndexReg.valid) { - ffoIndexReg.valid := true.B - ffoIndexReg.bits := Mux1H( - firstLane, - // 3: firstLaneIndex.width - data.map(i => i.bits(parameter.xLen - 1 - 3, 5) ## firstLaneIndex ## i.bits(4, 0)) - ) - } - ffoType := 
decodeResultReg(Decoder.ffo) - - /** vlmax = vLen * (2**lmul) / (2 ** sew * 8) \= (vLen / 8) * 2 ** (lmul - sew) \= vlb * 2 ** (lmul - sew) lmul <- - * (-3, -2, -1, 0 ,1, 2, 3) sew <- (0, 1, 2) lmul - sew <- [-5, 3] 选择信号 +5 -> lmul - sew + 5 <- [0, 8] - */ - def largeThanVLMax(source: UInt, advance: Bool = false.B, lmul: UInt, sew: UInt): Bool = { - val vlenLog2 = log2Ceil(parameter.vLen) // 10 - val cut = - if (source.getWidth >= vlenLog2) source(vlenLog2 - 1, vlenLog2 - 9) - else (0.U(vlenLog2.W) ## source)(vlenLog2 - 1, vlenLog2 - 9) - // 9: lmul - sew 的可能值的个数 - val largeList: Vec[Bool] = Wire(Vec(9, Bool())) - cut.asBools.reverse.zipWithIndex.foldLeft(advance) { case (a, (b, i)) => - largeList(i) := a - a || b - } - val extendVlmul = lmul(2) ## lmul - val selectWire = UIntToOH(5.U(4.W) + extendVlmul - sew)(8, 0).asBools.reverse - Mux1H(selectWire, largeList) - } - // 算req上面的分开吧 - val gatherWire = - Mux(decodeResult(Decoder.itype), requestRegDequeue.bits.instruction(19, 15), requestRegDequeue.bits.rs1Data) - val gatherAdvance = (gatherWire >> log2Ceil(parameter.vLen)).asUInt.orR - gatherOverlap := largeThanVLMax( - gatherWire, - gatherAdvance, - T1Issue.vlmul(requestReg.bits.issue), - T1Issue.vsew(requestReg.bits.issue) - ) - val slotValid = !control.state.idle - val storeAfterSlide = isStoreType && (requestRegDequeue.bits.instruction(11, 7) === vd) - instructionRAWReady := !((unOrderTypeInstruction && slotValid && - // slid 类的会比执行得慢的指令快(div),会修改前面的指令的source - ((vd === requestRegDequeue.bits.instruction(24, 20)) || - (vd === requestRegDequeue.bits.instruction(19, 15)) || - storeAfterSlide || - // slid 类的会比执行得快的指令慢(mv),会被后来的指令修改 source2 - (vs2 === requestRegDequeue.bits.instruction(11, 7))) || - (unOrderType && !allSlotFree) || - (requestReg.bits.vdIsV0 && existMaskType)) || - (vd === 0.U && maskType && slotValid)) - when(instructionToSlotOH(index)) { - writeBackCounter := 0.U - groupCounter := 0.U - executeCounter := 0.U - elementIndexCount := 0.U - compressWriteCount := 0.U - iotaCount := 0.U - slidUnitIdle := !((decodeResult(Decoder.slid) || (decodeResult(Decoder.gather) && decodeResult(Decoder.vtype)) - || decodeResult(Decoder.extend)) && instructionValid) - iotaUnitIdle := !((decodeResult(Decoder.compress) || decodeResult(Decoder.iota)) && instructionValid) - orderedReduceIdle.foreach(_ := !(decodeResult(Decoder.orderReduce) && instructionValid)) - orderedReduceGroupCount.foreach(_ := 0.U) - vd := requestRegDequeue.bits.instruction(11, 7) - vs1 := requestRegDequeue.bits.instruction(19, 15) - vs2 := requestRegDequeue.bits.instruction(24, 20) - vm := requestRegDequeue.bits.instruction(25) - executeFinishReg := false.B - rs1 := requestRegDequeue.bits.rs1Data - decodeResultReg := decodeResult - csrRegForMaskUnit := requestRegCSR - // todo: decode need execute - control.state.sMaskUnitExecution := !maskUnitType - maskTypeInstruction := maskType && !decodeResult(Decoder.maskSource) - completedVec.foreach(_ := false.B) - WARRedResult.valid := false.B - unOrderTypeInstruction := unOrderType - dataResult := 0.U.asTypeOf(dataResult) - }.elsewhen(control.state.wLast && maskUnitIdle) { - // 如果真需要执行的lane会wScheduler,不会提前发出last确认 - when(!mixedUnit) { - control.state.sMaskUnitExecution := true.B - } - maskUnitFlushVrf := !control.state.idle - } - when(laneSynchronize.asUInt.orR) { - feedBack := feedBack | laneSynchronize.asUInt - }.elsewhen(lastSlotCommit) { - feedBack := 0.U - } - // 执行 - // mask destination write - /** 对于mask destination 类型的指令需要特别注意两种不对齐 第一种是我们以 32(dataPatWidth) * 8(laneNumber) 
为一个组, 但是我们vl可能不对齐一整个组 第二种是 - * 32(dataPatWidth) 的时候对不齐 vl假设最大1024,相应的会有11位的vl xxx xxx xxxxx - */ - val dataPathMisaligned = csrRegForMaskUnit.vl(parameter.dataPathWidthBits - 1, 0).orR - val groupMisaligned = - if (parameter.laneNumber > 1) - csrRegForMaskUnit - .vl(parameter.dataPathWidthBits + log2Ceil(parameter.laneNumber) - 1, parameter.dataPathWidthBits) - .orR - else false.B - - /** 我们需要计算最后一次写的 [[writeBackCounter]] & [[groupCounter]] lastGroupCounter = vl(10, 8) - !([[dataPathMisaligned]] - * \|| [[groupMisaligned]]) lastExecuteCounter = vl(7, 5) - ![[dataPathMisaligned]] - */ - val lastGroupCounter: UInt = - csrRegForMaskUnit.vl( - parameter.laneParam.vlMaxBits - 1, - parameter.dataPathWidthBits + log2Ceil(parameter.laneNumber) - ) - !(dataPathMisaligned || groupMisaligned) - val lastExecuteCounter: UInt = if (parameter.laneNumber > 1) { - csrRegForMaskUnit.vl( - parameter.dataPathWidthBits + log2Ceil(parameter.laneNumber) - 1, - parameter.dataPathWidthBits - ) - !dataPathMisaligned - } else 0.U - val lastGroup = groupCounter === lastGroupCounter - val lastExecute = lastGroup && writeBackCounter === lastExecuteCounter - val lastExecuteForGroup = writeBackCounter.andR - // 计算正写的这个lane是不是在边界上 - val endOH = UIntToOH(csrRegForMaskUnit.vl(parameter.dataPathWidthBits - 1, 0)) - val border = lastExecute && dataPathMisaligned && - !(decodeResultReg(Decoder.compress) || decodeResultReg(Decoder.gather)) - val lastGroupMask = scanRightOr(endOH(parameter.datapathWidth - 1, 1)) - val mvType = decodeResultReg(Decoder.mv) - val readMv = mvType && decodeResultReg(Decoder.targetRd) - val writeMv = mvType && !decodeResultReg(Decoder.targetRd) && - csrRegForMaskUnit.vl > csrRegForMaskUnit.vStart - mvToVRF.foreach(d => when(requestRegDequeue.fire) { d := writeMv }) - // 读后写中的读 - val needWAR = (maskTypeInstruction || border || reduce || readMv) && !popCount - val skipLaneData: Bool = decodeResultReg(Decoder.mv) - mixedUnit := writeMv || readMv - maskReadLaneSelect.head := UIntToOH(writeBackCounter) - maskReadLaneSelect.head := UIntToOH(writeBackCounter) - maskWriteLaneSelect.head := maskReadLaneSelect.head - maskUnitReadVec.head.valid := false.B - maskUnitReadVec.head.bits.vs := Mux(readMv, vs2, Mux(reduce, vs1, vd)) - maskUnitReadVec.head.bits.readSource := Mux(readMv, 1.U, Mux(reduce, 0.U, 2.U)) - maskUnitReadVec.head.bits.offset := groupCounter - maskUnitRead.bits.instructionIndex := control.record.instructionIndex - val readResultSelectResult = Mux1H( - Pipe(true.B, maskUnitReadSelect, parameter.vrfReadLatency).bits, - laneReadResult - ) - // 把mask选出来 - val maskSelect = v0(groupCounter ## writeBackCounter) - val fullMask: UInt = (-1.S(parameter.datapathWidth.W)).asUInt - - /** 正常全1 mask:[[maskSelect]] border: [[lastGroupMask]] mask && border: [[maskSelect]] & [[lastGroupMask]] - */ - val maskCorrect: UInt = Mux(maskTypeInstruction, maskSelect, fullMask) & - Mux(border, lastGroupMask, fullMask) - // mask - val sew1HCorrect = Mux(decodeResultReg(Decoder.widenReduce), vSewOHForMask ## false.B, vSewOHForMask) - // 写的data - val writeData = (WARRedResult.bits & (~maskCorrect).asUInt) | (regroupData(writeBackCounter) & maskCorrect) - val writeMask = Mux(sew1HCorrect(2) || !reduce, 15.U, Mux(sew1HCorrect(1), 3.U, 1.U)) - maskUnitWriteVec.head.valid := false.B - maskUnitWriteVec.head.bits.vd := vd - maskUnitWriteVec.head.bits.offset := groupCounter - maskUnitWriteVec.head.bits.data := Mux(writeMv, rs1, Mux(reduce, dataResult.bits, writeData)) - maskUnitWriteVec.head.bits.last := control.state.wLast || 
reduce - maskUnitWriteVec.head.bits.instructionIndex := control.record.instructionIndex - - val waitReadResult: Bool = Wire(Bool()) - val maskUnitReadVrf = maskUnitReadReady && maskUnitReadVec.map(_.valid).reduce(_ || _) && !waitReadResult - val readNext = RegNext(maskUnitReadVrf) - waitReadResult := RegNext(readNext) || readNext - when(Pipe(maskUnitReadVrf, false.B, parameter.vrfReadLatency).valid) { - WARRedResult.bits := readResultSelectResult - WARRedResult.valid := true.B - } - // alu start - val aluInput1 = Mux( - (Seq(executeCounter === 0.U) ++ orderedReduceGroupCount.map(_ === 0.U)).reduce(_ && _), - Mux( - needWAR, - WARRedResult.bits & FillInterleaved(8, writeMask), - 0.U - ), - dataResult.bits - ) - val aluInput2 = Mux1H(UIntToOH(executeCounter), data.map(d => Mux(d.valid, d.bits, 0.U))) - val skipFlotReduce: Bool = !Mux1H(UIntToOH(executeCounter), flotReduceValid.map(_.getOrElse(false.B))) - // red alu instance - val adder: Instance[ReduceAdder] = Instantiate(new ReduceAdder(parameter.datapathWidth)) - val logicUnit: Instance[LaneLogic] = Instantiate(new LaneLogic(parameter.datapathWidth)) - // option unit for flot reduce - val floatAdder: Option[Instance[FloatAdder]] = - Option.when(parameter.fpuEnable)(Instantiate(new FloatAdder(8, 24))) - val flotCompare: Option[Instance[FloatCompare]] = - Option.when(parameter.fpuEnable)(Instantiate(new FloatCompare(8, 24))) - - val sign = !decodeResultReg(Decoder.unsigned1) - adder.request.src := VecInit( - Seq( - (aluInput1(parameter.datapathWidth - 1) && sign) ## aluInput1, - (aluInput2(parameter.datapathWidth - 1) && sign) ## aluInput2 - ) - ) - // popCount 在top视为reduce add - adder.request.opcode := Mux(popCount, 0.U, decodeResultReg(Decoder.uop)) - adder.request.sign := sign - adder.request.vSew := Mux(popCount, 2.U, OHToUInt(sew1HCorrect)) - - floatAdder.foreach { fAdder => - fAdder.io.a := aluInput1 - fAdder.io.b := aluInput2 - fAdder.io.roundingMode := csrRegForMaskUnit.vxrm - } - - flotCompare.foreach { fCompare => - fCompare.io.a := aluInput1 - fCompare.io.b := aluInput2 - // max -> 12, min -> 8 - fCompare.io.isMax := decodeResultReg(Decoder.uop)(2) - } - - logicUnit.req.src := VecInit(Seq(aluInput1, aluInput2)) - logicUnit.req.opcode := decodeResultReg(Decoder.uop) - - // reduce resultSelect - val intReduceResult = Mux( - decodeResultReg(Decoder.adder) || popCount, - adder.response.data, - logicUnit.resp - ) - val flotReduceResult: Option[UInt] = Option.when(parameter.fpuEnable)( - Mux( - skipFlotReduce, - aluInput1, - Mux(decodeResultReg(Decoder.fpExecutionType) === 0.U, floatAdder.get.io.out, flotCompare.get.io.out) - ) - ) - val aluOutPut = Mux1H( - Seq(if (parameter.fpuEnable) reduce && !decodeResultReg(Decoder.float) else reduce) ++ - Option.when(parameter.fpuEnable)(reduce && decodeResultReg(Decoder.float)), - Seq(intReduceResult) ++ flotReduceResult - ) - // slid & gather unit - val slideUp = decodeResultReg(Decoder.topUop)(1) - val slide1 = decodeResultReg(Decoder.topUop)(0) && decodeResultReg(Decoder.slid) - - /** special uop 里面编码了extend的信息: specialUop(1,0): 倍率 specialUop(2):是否是符号 - */ - val extendSourceSew: Bool = (csrRegForMaskUnit.vSew >> decodeResultReg(Decoder.topUop)(1, 0))(0) - val extendSign: Bool = decodeResultReg(Decoder.topUop)(2) - // gather 相关的控制 - val gather16: Bool = decodeResultReg(Decoder.gather16) - val maskUnitEEW = Mux(gather16, 1.U, Mux(extend, extendSourceSew, csrRegForMaskUnit.vSew)) - val maskUnitEEW1H: UInt = UIntToOH(maskUnitEEW) - val maskUnitByteEnable = maskUnitEEW1H(2) ## maskUnitEEW1H(2) ## 
maskUnitEEW1H(2, 1).orR ## true.B - val maskUnitBitEnable = FillInterleaved(8, maskUnitByteEnable) - maskUnitWriteVec.head.bits.mask := Mux(writeMv, maskUnitByteEnable, writeMask) - // log2(dataWidth * laneNumber / 8) - val maskUnitDataOffset = - (elementIndexCount << maskUnitEEW).asUInt( - log2Ceil(parameter.datapathWidth * parameter.laneNumber / 8) - 1, - 0 - ) ## 0.U(3.W) - val maskUnitData = ((VecInit(data.map(_.bits)).asUInt >> maskUnitDataOffset).asUInt & maskUnitBitEnable)( - parameter.datapathWidth - 1, - 0 - ) - - val compareWire = Mux(decodeResultReg(Decoder.slid), rs1, maskUnitData) - val compareAdvance: Bool = (compareWire >> log2Ceil(parameter.vLen)).asUInt.orR - val compareResult: Bool = - largeThanVLMax(compareWire, compareAdvance, csrRegForMaskUnit.vlmul, csrRegForMaskUnit.vSew) - // 正在被gather使用的数据在data的那个组里 - val gatherDataSelect = - UIntToOH((false.B ## maskUnitDataOffset)(5 + (log2Ceil(parameter.laneNumber).max(1)) - 1, 5)) - val dataTail = Mux1H(UIntToOH(maskUnitEEW)(1, 0), Seq(3.U(2.W), 2.U(2.W))) - val lastElementForData = gatherDataSelect.asBools.last && maskUnitDataOffset(4, 3) === dataTail - val lastElementForCompressMask = elementIndexCount(log2Ceil(parameter.datapathWidth) - 1, 0).andR - val maskUnitDataReady: Bool = (gatherDataSelect & VecInit(data.map(_.valid)).asUInt).orR - // 正在被gather使用的数据是否就绪了 - val isSlide = !(gather || extend) - val slidUnitDataReady: Bool = maskUnitDataReady || isSlide - val compressDataReady = maskUnitDataReady || !(decodeResultReg(Decoder.compress) || decodeResultReg(Decoder.iota)) - // slid 先用状态机 - val idle :: sRead :: sWrite :: Nil = Enum(3) - val slideState = RegInit(idle) - val readState = slideState === sRead - - // slid 的立即数是0扩展的 - val slidSize = Mux(slide1, 1.U, Mux(decodeResultReg(Decoder.itype), vs1, rs1)) - // todo: 这里是否有更好的处理方式 - val slidSizeLSB = slidSize(parameter.laneParam.vlMaxBits - 1, 0) - // down + - // up - - val directionSelection = Mux(slideUp, (~slidSizeLSB).asUInt, slidSizeLSB) - val slideReadIndex = elementIndexCount + directionSelection + slideUp - val readIndex: UInt = Mux( - !maskUnitIdle, - Mux( - decodeResultReg(Decoder.slid), - slideReadIndex, - maskUnitData - ), - gatherWire - ) - - def indexAnalysis(elementIndex: UInt, csrInput: CSRInterface = csrRegForMaskUnit) = { - val sewInput = csrInput.vSew - val sewOHInput = UIntToOH(csrInput.vSew)(2, 0) - val intLMULInput: UInt = (1.U << csrInput.vlmul(1, 0)).asUInt - val dataPosition = (elementIndex(parameter.laneParam.vlMaxBits - 2, 0) << sewInput) - .asUInt(parameter.laneParam.vlMaxBits - 2, 0) - val accessMask = Mux1H( - sewOHInput(2, 0), - Seq( - UIntToOH(dataPosition(1, 0)), - FillInterleaved(2, UIntToOH(dataPosition(1))), - 15.U(4.W) - ) - ) - // 数据起始位置在32bit(暂时只32)中的偏移,由于数据会有跨lane的情况,融合的优化时再做 - val dataOffset = (dataPosition(1) && sewOHInput(1, 0).orR) ## (dataPosition(0) && sewOHInput(0)) ## 0.U(3.W) - val accessLane = if (parameter.laneNumber > 1) dataPosition(log2Ceil(parameter.laneNumber) + 1, 2) else 0.U(1.W) - // 32 bit / group - val dataGroup = (dataPosition >> (log2Ceil(parameter.laneNumber) + 2)).asUInt - val offsetWidth: Int = parameter.laneParam.vrfParam.vrfOffsetBits - val offset = dataGroup(offsetWidth - 1, 0) - val accessRegGrowth = (dataGroup >> offsetWidth).asUInt - val decimalProportion = offset ## accessLane - // 1/8 register - val decimal = decimalProportion(decimalProportion.getWidth - 1, 0.max(decimalProportion.getWidth - 3)) - - /** elementIndex 需要与vlMax比较, vLen * lmul /sew 这个计算太复杂了 我们可以换一个角度,计算读寄存器的增量与lmul比较,就能知道下标是否超vlMax了 vlmul - 
* 需要区分整数与浮点 - */ - val overlap = - (csrInput.vlmul(2) && decimal >= intLMULInput(3, 1)) || - (!csrInput.vlmul(2) && accessRegGrowth >= intLMULInput) - accessRegGrowth >= csrInput.vlmul - val reallyGrowth = accessRegGrowth(2, 0) - (accessMask, dataOffset, accessLane, offset, reallyGrowth, overlap) - } - val srcOverlap: Bool = !decodeResultReg(Decoder.itype) && (rs1 >= csrRegForMaskUnit.vl) - // rs1 >= vlMax - val srcOversize = !decodeResultReg(Decoder.itype) && !slide1 && compareResult - val signBit = Mux1H( - vSewOHForMask, - readIndex(parameter.laneParam.vlMaxBits - 1, parameter.laneParam.vlMaxBits - 3).asBools.reverse - ) - // 对于up来说小于offset的element是不变得的 - val slideUpUnderflow = slideUp && !slide1 && (signBit || srcOverlap) - val elementActive: Bool = v0.asUInt(elementIndexCount) || vm - val slidActive = elementActive && (!slideUpUnderflow || !decodeResultReg(Decoder.slid)) - // index >= vlMax 是写0 - val overlapVlMax: Bool = !slideUp && (signBit || srcOversize) - // select csr - val csrSelect = Mux(control.state.idle, requestRegCSR, csrRegForMaskUnit) - // slid read - val (_, readDataOffset, readLane, readOffset, readGrowth, lmulOverlap) = indexAnalysis(readIndex, csrSelect) - gatherReadDataOffset := readDataOffset - val readOverlap = lmulOverlap || overlapVlMax - val skipRead = readOverlap || (gather && compareResult) || extend - val maskUnitWriteVecFire1 = maskUnitReadVec(1).valid && maskUnitReadReady - val readFireNext1: Bool = RegNext(maskUnitWriteVecFire1) - val readFireNextNext1: Bool = RegNext(readFireNext1) - val port1WaitForResult: Bool = readFireNext1 || readFireNextNext1 - val gatherTryToRead = - gatherNeedRead && !VecInit(lsu.vrfReadDataPorts.map(_.valid)).asUInt.orR && !gatherReadFinish - maskUnitReadVec(1).valid := (readState || gatherTryToRead) && !port1WaitForResult - maskUnitReadVec(1).bits.vs := Mux(readState, vs2, requestRegDequeue.bits.instruction(24, 20)) + readGrowth - maskUnitReadVec(1).bits.readSource := 1.U - maskUnitReadVec(1).bits.offset := readOffset - maskReadLaneSelect(1) := UIntToOH(readLane) - // slid write, vlXXX: 用element index 算出来的 - val (vlMask, vlDataOffset, vlLane, vlOffset, vlGrowth, _) = indexAnalysis(elementIndexCount) - val writeState = slideState === sWrite - // 处理数据,先硬移位吧 - val slidReadData: UInt = ((WARRedResult.bits >> readDataOffset) << vlDataOffset) - .asUInt(parameter.datapathWidth - 1, 0) - val selectRS1 = slide1 && ((slideUp && firstElement) || (!slideUp && lastElement)) - // extend 类型的扩展和移位 - val extendData: UInt = (Mux( - extendSourceSew, - Fill(parameter.datapathWidth - 16, extendSign && maskUnitData(15)) ## maskUnitData(15, 0), - Fill(parameter.datapathWidth - 8, extendSign && maskUnitData(7)) ## maskUnitData(7, 0) - ) << vlDataOffset).asUInt(parameter.xLen - 1, 0) - - /** vd 的值有4种: - * 1. 用readIndex读出来的vs2的值 - * 1. 0 - * 1. slide1 时插进来的rs1 - * 1. 
extend 的值 - */ - val slidWriteData = Mux1H( - Seq((!(readOverlap || selectRS1 || extend)) || (gather && !compareResult), selectRS1, extend), - Seq(slidReadData, (rs1 << vlDataOffset).asUInt(parameter.xLen - 1, 0), extendData) - ) - maskUnitWriteVec(1).valid := writeState && slidActive - maskUnitWriteVec(1).bits.vd := vd + vlGrowth - maskUnitWriteVec(1).bits.offset := vlOffset - maskUnitWriteVec(1).bits.mask := vlMask - maskUnitWriteVec(1).bits.data := slidWriteData - maskUnitWriteVec(1).bits.last := lastElement - maskUnitWriteVec(1).bits.instructionIndex := control.record.instructionIndex - maskWriteLaneSelect(1) := UIntToOH(vlLane) - // slid 跳状态机 - when(slideState === idle) { - when((!slidUnitIdle) && slidUnitDataReady) { - when(skipRead) { - slideState := sWrite - }.otherwise { - slideState := sRead - } - } - } - when(readState) { - // 不需要valid,因为这个状态下一定是valid的 - when(readFireNextNext1) { - slideState := sWrite - } - } - when(writeState) { - when(maskUnitWriteReady || !slidActive) { - when(lastElement) { - slideState := idle - slidUnitIdle := true.B - when(gather || extend) { - synchronized := true.B - dataClear := true.B - maskUnitReadOnlyFinish := true.B - } - }.otherwise { - when(lastElementForData && (gather || extend)) { - synchronized := true.B - dataClear := true.B - slideState := idle - }.otherwise { - // todo: skip read - slideState := sRead - } - updateMaskIndex := true.B - } - } - } - - // compress & iota - val idle1 :: sReadMask :: sWrite1 :: Nil = Enum(3) - val compressState = RegInit(idle1) - val compressStateIdle = compressState === idle1 - val compressStateRead = compressState === sReadMask - val compressStateWrite = compressState === sWrite1 - - // compress 用vs1当mask,需要先读vs1 - val readCompressMaskNext = Pipe(maskUnitReadReady && compressStateRead, false.B, parameter.vrfReadLatency).valid - when(readCompressMaskNext) { - maskDataForCompress := readResultSelectResult - } - - // 处理 iota - val iotaDataOffset: UInt = elementIndexCount(log2Ceil(parameter.datapathWidth * parameter.laneNumber) - 1, 0) - val lastDataForIota: Bool = iotaDataOffset.andR - val iotaData = VecInit(data.map(_.bits)).asUInt(iotaDataOffset) - val iota = decodeResultReg(Decoder.iota) - - val maskUnitReadFire2: Bool = maskUnitReadVec(2).valid && maskUnitReadReady - val readFireNext2 = RegNext(maskUnitReadFire2) - val readFireNextNext2 = RegNext(readFireNext2) - val port2WaitForResult = readFireNextNext2 || readFireNext2 - - /** 计算需要读的mask的相关 elementIndexCount -> 11bit 只会访问单寄存器 elementIndexCount(4, 0)做为32bit内的offset elementIndexCount(7, - * 5)作为lane的选择 elementIndexCount(9, 8)作为offset - */ - // compress read - maskUnitReadVec(2).valid := compressStateRead && !port2WaitForResult - maskUnitReadVec(2).bits.vs := vs1 - maskUnitReadVec(2).bits.readSource := 0.U - maskUnitReadVec(2).bits.offset := elementIndexCount( - log2Ceil(parameter.datapathWidth) + log2Ceil(parameter.laneNumber) + - parameter.laneParam.vrfParam.vrfOffsetBits - 1, - log2Ceil(parameter.datapathWidth) + log2Ceil(parameter.laneNumber) - ) - maskReadLaneSelect(2) := UIntToOH( - elementIndexCount( - log2Ceil(parameter.datapathWidth) + ((log2Ceil(parameter.laneNumber) - 1).max(0)), - log2Ceil(parameter.datapathWidth) - ) - ) - // val lastElementForMask: Bool = elementIndexCount(4, 0).andR - val maskForCompress: Bool = maskDataForCompress(elementIndexCount(log2Ceil(parameter.datapathWidth) - 1, 0)) - - // compress vm=0 是保留的 - val skipWrite = !Mux(decodeResultReg(Decoder.compress), maskForCompress, elementActive) - val dataGroupTailForCompressUnit: 
Bool = Mux(iota, lastDataForIota, lastElementForData) - - // 计算compress write的位置信息 - val (compressMask, compressDataOffset, compressLane, compressOffset, compressGrowth, _) = - indexAnalysis(compressWriteCount) - val compressWriteData = (maskUnitData << compressDataOffset).asUInt - val iotaWriteData = (iotaCount << vlDataOffset).asUInt - // compress write - maskUnitWriteVec(2).valid := compressStateWrite && !skipWrite - maskUnitWriteVec(2).bits.vd := vd + Mux(iota, vlGrowth, compressGrowth) - maskUnitWriteVec(2).bits.offset := Mux(iota, vlOffset, compressOffset) - maskUnitWriteVec(2).bits.mask := Mux(iota, vlMask, compressMask) - maskUnitWriteVec(2).bits.data := Mux(iota, iotaWriteData, compressWriteData) - maskUnitWriteVec(2).bits.last := lastElement - maskUnitWriteVec(2).bits.instructionIndex := control.record.instructionIndex - maskWriteLaneSelect(2) := UIntToOH(Mux(iota, vlLane, compressLane)) - - // 跳状态机 - // compress每组数据先读mask - val firstState = Mux(iota, sWrite1, sReadMask) - when(compressStateIdle && (!iotaUnitIdle) && compressDataReady) { - compressState := firstState - } - - when(compressStateRead && readFireNextNext2) { - compressState := sWrite1 - } - - when(compressStateWrite) { - when(maskUnitWriteReady || skipWrite) { - when(!skipWrite) { - compressWriteCount := compressWriteCount + 1.U - iotaCount := iotaCount + iotaData - } - when(lastElement) { - compressState := idle - iotaUnitIdle := true.B - synchronized := true.B - dataClear := true.B - maskUnitReadOnlyFinish := true.B - }.otherwise { - when(lastElementForCompressMask) { - // update vs1 as mask for compress - compressState := sRead - } - when(dataGroupTailForCompressUnit) { - synchronized := true.B - dataClear := true.B - compressState := idle - } - updateMaskIndex := true.B - } - } - } - // for small vl & reduce - val accessByte = (csrRegForMaskUnit.vl << csrRegForMaskUnit.vSew).asUInt - // vl < row(vl) - val smallVL = accessByte < (parameter.datapathWidth * parameter.laneNumber / 8).U - val byteSizePerDataPathBits = log2Ceil(parameter.datapathWidth / 8) - val lastExecuteCounterForReduce: UInt = if (parameter.laneNumber > 1) { - accessByte( - byteSizePerDataPathBits + log2Ceil(parameter.laneNumber) - 1, - byteSizePerDataPathBits - ) - !accessByte(byteSizePerDataPathBits - 1, 0).orR - } else 0.U - val lastGroupDataWaitMaskForRed: UInt = scanRightOr(UIntToOH(lastExecuteCounterForReduce)) - // alu end - val maskOperation = - decodeResultReg(Decoder.maskLogic) || - decodeResultReg(Decoder.maskDestination) || - decodeResultReg(Decoder.ffo) - // How many data path(32 bit) will used by maskDestination instruction. 
- val maskDestinationByteSize: Bits = - csrRegForMaskUnit.vl(log2Ceil(parameter.dLen) - 1, 0) << csrRegForMaskUnit.vSew - val maskDestinationUseDataPathSize = - (maskDestinationByteSize >> 2).asUInt + maskDestinationByteSize(1, 0).orR - val lastGroupCountForThisGroup: UInt = maskDestinationUseDataPathSize(log2Ceil(parameter.laneNumber) - 1, 0) - val counterForMaskDestination: UInt = if (parameter.laneNumber > 1) { - (lastGroupCountForThisGroup - 1.U) | - Fill( - log2Ceil(parameter.laneNumber), - (maskDestinationUseDataPathSize >> log2Ceil(parameter.laneNumber)).asUInt.orR - ) - } else 0.U - - val waitSourceDataCounter = - Mux(decodeResultReg(Decoder.maskDestination), counterForMaskDestination, lastExecuteCounter) - val lastGroupDataWaitMask = scanRightOr(UIntToOH(waitSourceDataCounter)) - // todo: other ways - val lastOrderedGroup: Option[Bool] = orderedReduceGroupCount.map(count => - (count ## 0 - .U(log2Ceil(parameter.laneNumber).W) + -1.S(log2Ceil(parameter.laneNumber).W).asUInt) >= csrRegForMaskUnit.vl - ) - val misalignedOrdered: Bool = if (parameter.fpuEnable) { - lastOrderedGroup.get && csrRegForMaskUnit.vl(log2Ceil(parameter.laneNumber) - 1, 0).orR && decodeResultReg( - Decoder.float - ) - } else false.B - val dataMask = - Mux( - maskOperation && lastGroup, - lastGroupDataWaitMask, - Mux( - reduce && (smallVL || misalignedOrdered), - lastGroupDataWaitMaskForRed, - -1.S(parameter.laneNumber.W).asUInt - ) - ) - val dataReady = ((~dataMask).asUInt | VecInit(data.map(_.valid)).asUInt).andR || skipLaneData - when( - // data ready - dataReady && - // state check - !control.state.sMaskUnitExecution - ) { - // 读 - when(needWAR && !WARRedResult.valid) { - maskUnitReadVec.head.valid := true.B - } - // 可能有的计算 - val nextExecuteIndex: UInt = executeCounter + 1.U - val isLastExecuteForGroup: Bool = executeCounter(log2Ceil(parameter.laneNumber) - 1, 0).andR - val lastExecuteForInstruction: Option[Bool] = orderedReduceGroupCount.map(count => - (count ## 0.U(log2Ceil(parameter.laneNumber).W) + nextExecuteIndex) === csrRegForMaskUnit.vl - ) - val readFinish = WARRedResult.valid || !needWAR - val readDataSign = - Mux1H(vSewOHForMask(2, 0), Seq(WARRedResult.bits(7), WARRedResult.bits(15), WARRedResult.bits(31))) - when(readFinish && !executeFinishReg) { - when(readMv) { - control.state.sMaskUnitExecution := true.B - // signExtend for vmv.x.s - dataResult.bits := Mux(vSewOHForMask(2), WARRedResult.bits(31, 16), Fill(16, readDataSign)) ## - Mux(vSewOHForMask(0), Fill(8, readDataSign), WARRedResult.bits(15, 8)) ## - WARRedResult.bits(7, 0) - - }.otherwise { - executeCounter := nextExecuteIndex - when(executeCounter =/= csrRegForMaskUnit.vl) { - dataResult.bits := aluOutPut - } - if (parameter.fpuEnable) { - when(!orderedReduceIdle.get) { - when(lastExecuteForInstruction.get) { - orderedReduceIdle.get := true.B - }.elsewhen(isLastExecuteForGroup) { - synchronized := true.B - executeCounter := 0.U - dataClear := true.B - orderedReduceGroupCount.foreach(d => d := d + 1.U) - } - } - } - } - } - // for vfredmax - val lastReduceCounter = - executeCounter === csrRegForMaskUnit.vl || executeCounter(log2Ceil(parameter.laneNumber)) - dontTouch(lastReduceCounter) - val executeFinish: Bool = - (lastReduceCounter || !(reduce || popCount) || orderedReduce) && maskUnitIdle - val schedulerWrite = decodeResultReg(Decoder.maskDestination) || (reduce && !popCount) || writeMv - val groupSync = decodeResultReg(Decoder.ffo) - // 写回 - when(readFinish && (executeFinish || writeMv || executeFinishReg)) { - 
maskUnitWriteVec.head.valid := schedulerWrite - executeFinishReg := true.B - when(maskUnitWriteReady || !schedulerWrite) { - WARRedResult.valid := false.B - writeBackCounter := writeBackCounter + schedulerWrite - when(lastExecuteForGroup || lastExecute || reduce || groupSync || writeMv || popCount) { - synchronized := true.B - dataClear := true.B - when(lastExecuteForGroup || groupSync) { - executeForLastLaneFire := true.B - groupCounter := groupCounter + 1.U - } - when(lastExecute || reduce || writeMv || popCount) { - control.state.sMaskUnitExecution := true.B - } - } - } - } - } } control } @@ -1552,44 +767,26 @@ class T1(val parameter: T1Parameter) lane.csrInterface.vl := evlForLane lane.laneIndex := index.U - // - LSU request next offset of group - // - all lane are synchronized - // - the index type of instruction is finished. - lane.laneResponseFeedback.valid := lsu.lsuOffsetRequest || synchronized || completeIndexInstruction - // - the index type of instruction is finished. - // - for find first one. - lane.laneResponseFeedback.bits.complete := - completeIndexInstruction || - completedLeftOr(index) || - maskUnitReadOnlyFinish - // tell lane which - lane.laneResponseFeedback.bits.instructionIndex := slots.last.record.instructionIndex - // lsu 优先会有死锁: // vmadc, v1, v2, 1 (vl=17) -> 需要先读后写 // vse32.v v1, (a0) -> 依赖上一条,但是会先发出read // 读 lane - lane.vrfReadAddressChannel.valid := lsu.vrfReadDataPorts(index).valid || - (maskUnitRead.valid && maskUnitReadSelect(index)) + lane.vrfReadAddressChannel.valid := lsu.vrfReadDataPorts(index).valid || maskUnit.readChannel(index).valid lane.vrfReadAddressChannel.bits := - Mux(maskUnitRead.valid, maskUnitRead.bits, lsu.vrfReadDataPorts(index).bits) - lsu.vrfReadDataPorts(index).ready := lane.vrfReadAddressChannel.ready && !maskUnitRead.valid - readSelectMaskUnit(index) := - lane.vrfReadAddressChannel.ready && maskUnitReadSelect(index) - laneReadResult(index) := lane.vrfReadDataChannel + Mux(maskUnit.readChannel(index).valid, maskUnit.readChannel(index).bits, lsu.vrfReadDataPorts(index).bits) + lsu.vrfReadDataPorts(index).ready := lane.vrfReadAddressChannel.ready && !maskUnit.readChannel(index).valid + maskUnit.readChannel(index).ready := lane.vrfReadAddressChannel.ready + maskUnit.readResult(index) := lane.vrfReadDataChannel lsu.vrfReadResults(index) := lane.vrfReadDataChannel - // 写lane - lane.vrfWriteChannel.valid := vrfWrite(index).valid || (maskUnitWrite.valid && maskUnitWriteSelect(index)) - lane.vrfWriteChannel.bits := - Mux(vrfWrite(index).valid, vrfWrite(index).bits, maskUnitWrite.bits) + // lsu & mask unit write lane + lane.vrfWriteChannel.valid := vrfWrite(index).valid + lane.vrfWriteChannel.bits := vrfWrite(index).bits vrfWrite(index).ready := lane.vrfWriteChannel.ready - writeSelectMaskUnit(index) := - lane.vrfWriteChannel.ready && !vrfWrite(index).valid && maskUnitWriteSelect(index) - lsu.offsetReadResult(index).valid := lane.laneResponse.valid && lane.laneResponse.bits.toLSU - lsu.offsetReadResult(index).bits := lane.laneResponse.bits.data - lsu.offsetReadIndex(index) := lane.laneResponse.bits.instructionIndex + lsu.offsetReadResult(index).valid := lane.maskUnitRequest.valid && lane.maskRequestToLSU + lsu.offsetReadResult(index).bits := lane.maskUnitRequest.bits.source2 + lsu.offsetReadIndex(index) := lane.maskUnitRequest.bits.index instructionFinished(index).zip(slots.map(_.record.instructionIndex)).foreach { case (d, f) => d := (UIntToOH(f(parameter.instructionIndexBits - 2, 0)) & lane.instructionFinished).orR @@ -1598,12 +795,7 @@ 
class T1(val parameter: T1Parameter) val v0ForThisLane: Seq[UInt] = regroupV0.map(rv => cutUInt(rv, parameter.vLen / parameter.laneNumber)(index)) val v0SelectBySew = Mux1H(UIntToOH(lane.maskSelectSew)(2, 0), v0ForThisLane) lane.maskInput := cutUInt(v0SelectBySew, parameter.datapathWidth)(lane.maskSelect) - lane.lsuLastReport := lsu.lastReport | - Mux( - maskUnitFlushVrf, - indexToOH(slots.last.record.instructionIndex, parameter.chainingSize), - 0.U - ) + lane.lsuLastReport := lsu.lastReport | maskUnit.lastReport lane.lsuMaskGroupChange := lsu.lsuMaskGroupChange lane.loadDataInLSUWriteQueue := lsu.dataInWriteQueue(index) @@ -1613,18 +805,7 @@ class T1(val parameter: T1Parameter) (requestReg.bits.writeByte >> rowWith).asUInt + (requestReg.bits.writeByte(rowWith - 1, 0) > ((parameter.datapathWidth / 8) * index).U) - // 处理lane的mask类型请求 - laneSynchronize(index) := lane.laneResponse.valid && !lane.laneResponse.bits.toLSU - when(laneSynchronize(index)) { - data(index).valid := true.B - data(index).bits := lane.laneResponse.bits.data - completedVec(index) := lane.laneResponse.bits.ffoSuccess - flotReduceValid(index).foreach(d => d := lane.laneResponse.bits.fpReduceValid.get) - } - // token manager - tokenManager.writeV0(index).valid := lane.vrfWriteChannel.fire && (lane.vrfWriteChannel.bits.vd === 0.U) - tokenManager.writeV0(index).bits := lane.vrfWriteChannel.bits.instructionIndex tokenManager.instructionFinish(index) := lane.instructionFinished lane @@ -1656,6 +837,35 @@ class T1(val parameter: T1Parameter) lsu.writeReadyForLsu := VecInit(laneVec.map(_.writeReadyForLsu)).asUInt.andR lsu.vrfReadyToStore := VecInit(laneVec.map(_.vrfReadyToStore)).asUInt.andR + // connect mask unit + maskUnit.instReq.valid := requestRegDequeue.fire && requestReg.bits.decodeResult(Decoder.maskUnit) + maskUnit.instReq.bits.instructionIndex := requestReg.bits.instructionIndex + maskUnit.instReq.bits.decodeResult := decodeResult + maskUnit.instReq.bits.readFromScala := source1Select + maskUnit.instReq.bits.sew := T1Issue.vsew(requestReg.bits.issue) + maskUnit.instReq.bits.maskType := maskType + maskUnit.instReq.bits.vxrm := requestReg.bits.issue.vcsr(2, 1) + maskUnit.instReq.bits.vlmul := requestReg.bits.issue.vtype(2, 0) + maskUnit.instReq.bits.vs2 := requestRegDequeue.bits.instruction(24, 20) + maskUnit.instReq.bits.vl := requestReg.bits.issue.vl + maskUnit.maskInput := cutUInt(v0.asUInt, parameter.maskGroupWidth)(maskUnit.maskSelect) + + maskUnit.exeReq.zip(laneVec).foreach { case (maskInput, lane) => + maskInput <> lane.maskUnitRequest + } + maskUnit.exeResp.zip(laneVec).foreach { case (maskOutput, lane) => + lane.maskUnitResponse <> maskOutput + } + + val gatherResultSelect: UInt = Mux1H( + gatherReadLaneSelect, + laneVec.map(_.vrfReadDataChannel) + ) + // gather read result + when(gatherReadResultFire) { + gatherData := Mux(gatherOverlap, 0.U, (gatherResultSelect >> gatherReadDataOffset).asUInt) + } + // 连lane的环 parameter.crossLaneConnectCycles.zipWithIndex.foreach { case (cycles, index) => cycles.zipWithIndex.foreach { case (cycle, portIndex) => @@ -1721,10 +931,17 @@ class T1(val parameter: T1Parameter) // we detect the hazard and decide should we issue this slide or // issue the instruction after the slide which already in the slot. 
requestRegDequeue.ready := executionReady && slotReady && (!gatherNeedRead || gatherReadFinish) && - instructionRAWReady && instructionIndexFree && vrfAllocate + tokenManager.issueAllow && instructionIndexFree && vrfAllocate instructionToSlotOH := Mux(requestRegDequeue.fire, slotToEnqueue, 0.U) + tokenManager.instructionIssue.valid := requestRegDequeue.fire + tokenManager.instructionIssue.bits.instructionIndex := requestReg.bits.instructionIndex + tokenManager.instructionIssue.bits.writeV0 := + (!requestReg.bits.decodeResult(Decoder.targetRd) && !isStoreType) && requestReg.bits.vdIsV0 + tokenManager.instructionIssue.bits.useV0AsMask := maskType + tokenManager.instructionIssue.bits.isLoadStore := !requestRegDequeue.bits.instruction(6) + // instruction commit { val slotCommit: Vec[Bool] = VecInit(slots.map { inst => @@ -1740,7 +957,7 @@ class T1(val parameter: T1Parameter) inst.record.instructionIndex === responseCounter }) retire := slotCommit.asUInt.orR - io.retire.rd.bits.rdData := Mux(ffoType, ffoIndexReg.bits, dataResult.bits) + io.retire.rd.bits.rdData := dataResult // TODO: csr retire. io.retire.csr.bits.vxsat := (slotCommit.asUInt & VecInit(slots.map(_.vxsat)).asUInt).orR io.retire.csr.bits.fflag := DontCare @@ -1762,11 +979,6 @@ class T1(val parameter: T1Parameter) data := (data & (~maskExt).asUInt) | (maskExt & v0Write.bits.data) } } - when(dataClear) { - data.foreach(_.valid := false.B) - } - // don't care有可能会导致先读后写失败 - maskUnitReadVec.foreach(_.bits.instructionIndex := slots.last.record.instructionIndex) layer.block(layers.Verification) { @@ -1779,9 +991,10 @@ class T1(val parameter: T1Parameter) probeWire.issueTag := requestReg.bits.instructionIndex probeWire.retireValid := retire // maskUnitWrite maskUnitWriteReady - probeWire.writeQueueEnq.valid := maskUnitWrite.valid && maskUnitWriteReady - probeWire.writeQueueEnq.bits := maskUnitWrite.bits.instructionIndex - probeWire.writeQueueEnqMask := maskUnitWrite.bits.mask + // todo + probeWire.writeQueueEnq.valid := DontCare + probeWire.writeQueueEnq.bits := DontCare + probeWire.writeQueueEnqMask := DontCare probeWire.instructionValid := maskAnd( !slots.last.state.sMaskUnitExecution && !slots.last.state.idle, indexToOH(slots.last.record.instructionIndex, parameter.chainingSize * 2) diff --git a/t1/src/decoder/Decoder.scala b/t1/src/decoder/Decoder.scala index 4072ae589..74b08d520 100644 --- a/t1/src/decoder/Decoder.scala +++ b/t1/src/decoder/Decoder.scala @@ -33,7 +33,7 @@ trait T1UopField extends T1DecodeFiled[UInt] with FieldName { } trait T1TopUopField extends T1DecodeFiled[UInt] with FieldName { - def chiselType: UInt = UInt(3.W) + def chiselType: UInt = UInt(5.W) } trait T1fpExecutionTypeUopField extends T1DecodeFiled[UInt] with FieldName { @@ -227,14 +227,14 @@ object Decoder { object topUop extends T1TopUopField { override def genTable(pattern: T1DecodePattern): BitPat = pattern.topUop.value match { - case _: TopT0.type => BitPat("b000") - case _: TopT1.type => BitPat("b001") - case _: TopT2.type => BitPat("b010") - case _: TopT3.type => BitPat("b011") - case _: TopT5.type => BitPat("b101") - case _: TopT6.type => BitPat("b110") - case _: TopT7.type => BitPat("b111") - case _ => BitPat.dontCare(3) + case _: TopT0.type => BitPat("b00000") + case _: TopT1.type => BitPat("b00001") + case _: TopT2.type => BitPat("b00010") + case _: TopT3.type => BitPat("b00011") + case _: TopT5.type => BitPat("b00101") + case _: TopT6.type => BitPat("b00110") + case _: TopT7.type => BitPat("b00111") + case _ => BitPat.dontCare(5) } } diff --git 
a/t1/src/decoder/attribute/isSwrite.scala b/t1/src/decoder/attribute/isSwrite.scala index f14bad4c0..9b33abe53 100644 --- a/t1/src/decoder/attribute/isSwrite.scala +++ b/t1/src/decoder/attribute/isSwrite.scala @@ -134,8 +134,6 @@ object isSwrite { "vredor.vs", "vredsum.vs", "vredxor.vs", - "vrgather.vv", - "vrgatherei16.vv", "vs1r.v", "vs2r.v", "vs4r.v", diff --git a/t1/src/laneStage/LaneStage3.scala b/t1/src/laneStage/LaneStage3.scala index d6fb70eca..4854c0941 100644 --- a/t1/src/laneStage/LaneStage3.scala +++ b/t1/src/laneStage/LaneStage3.scala @@ -46,18 +46,11 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { val pipeEnqueue: Option[LaneStage3Enqueue] = Option.when(isLastSlot)(RegInit(0.U.asTypeOf(enqueue.bits))) - /** response to [[T1.lsu]] or mask unit in [[T1]] */ @public - val laneResponse: Option[ValidIO[LaneResponse]] = Option.when(isLastSlot)(IO(Valid(new LaneResponse(parameter)))) - @public - val stageValid: Bool = IO(Output(Bool())) + val stageValid: Bool = IO(Output(Bool())) - /** feedback from [[T1]] to [[Lane]] for [[laneResponse]] */ - @public - val laneResponseFeedback: Option[ValidIO[LaneResponseFeedback]] = - Option.when(isLastSlot)(IO(Flipped(Valid(new LaneResponseFeedback(parameter))))) @public - val crossWritePort: Option[Vec[DecoupledIO[WriteBusData]]] = + val crossWritePort: Option[Vec[DecoupledIO[WriteBusData]]] = Option.when(isLastSlot)(IO(Vec(2, Decoupled(new WriteBusData(parameter))))) val stageValidReg: Option[Bool] = Option.when(isLastSlot)(RegInit(false.B)) @@ -68,20 +61,10 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { /** schedule cross lane write MSB */ val sCrossWriteMSB: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) - // state for response to scheduler - /** schedule send [[LaneResponse]] to scheduler */ - val sSendResponse: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) - - /** wait scheduler send [[LaneResponseFeedback]] */ - val wResponseFeedback: Option[Bool] = Option.when(isLastSlot)(RegInit(true.B)) - // update register when(enqueue.fire) { pipeEnqueue.foreach(_ := enqueue.bits) (sCrossWriteLSB ++ sCrossWriteMSB).foreach(_ := !enqueue.bits.decodeResult(Decoder.crossWrite)) - (sSendResponse ++ wResponseFeedback).foreach( - _ := enqueue.bits.decodeResult(Decoder.scheduler) || enqueue.bits.sSendResponse - ) } // Used to cut off back pressure forward @@ -111,43 +94,18 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { sendState(index) := true.B } } - // scheduler synchronization - val schedulerFinish: Bool = (sSendResponse ++ wResponseFeedback).reduce(_ && _) val dataSelect: Option[UInt] = Option.when(isLastSlot) { Mux( pipeEnqueue.get.decodeResult(Decoder.nr) || - (enqueue.bits.ffoByOtherLanes && pipeEnqueue.get.decodeResult(Decoder.ffo)) || - pipeEnqueue.get.decodeResult(Decoder.dontNeedExecuteInLane), + (enqueue.bits.ffoByOtherLanes && pipeEnqueue.get.decodeResult(Decoder.ffo)), pipeEnqueue.get.pipeData, pipeEnqueue.get.data ) } - // mask request - laneResponse.head.valid := stageValidReg.get && !sSendResponse.get - laneResponse.head.bits.data := Mux( - pipeEnqueue.get.decodeResult(Decoder.ffo), - pipeEnqueue.get.ffoIndex, - dataSelect.get - ) - laneResponse.head.bits.toLSU := pipeEnqueue.get.loadStore - laneResponse.head.bits.instructionIndex := pipeEnqueue.get.instructionIndex - laneResponse.head.bits.ffoSuccess := pipeEnqueue.get.ffoSuccess - laneResponse.head.bits.fpReduceValid.zip(pipeEnqueue.get.fpReduceValid).foreach { 
case (s, f) => s := f } - - sSendResponse.foreach(state => - when(laneResponse.head.valid) { - state := true.B - } - ) - wResponseFeedback.foreach(state => - when(laneResponseFeedback.head.valid) { - state := true.B - } - ) // enqueue write for last slot - vrfWriteQueue.io.enq.valid := stageValidReg.get && schedulerFinish && !pipeEnqueue.get.decodeResult(Decoder.sWrite) + vrfWriteQueue.io.enq.valid := stageValidReg.get && !pipeEnqueue.get.decodeResult(Decoder.sWrite) // UInt(5.W) + UInt(3.W), use `+` here vrfWriteQueue.io.enq.bits.vd := pipeEnqueue.get.vd + pipeEnqueue.get.groupCounter( @@ -165,8 +123,8 @@ class LaneStage3(parameter: LaneParameter, isLastSlot: Boolean) extends Module { /** Cross-lane writing is over */ val CrossLaneWriteOver: Bool = (sCrossWriteLSB ++ sCrossWriteMSB).reduce(_ && _) - enqueue.ready := !stageValidReg.get || (CrossLaneWriteOver && schedulerFinish && vrfWriteReady) - val dequeueFire = stageValidReg.get && CrossLaneWriteOver && schedulerFinish && vrfWriteReady + enqueue.ready := !stageValidReg.get || (CrossLaneWriteOver && vrfWriteReady) + val dequeueFire = stageValidReg.get && CrossLaneWriteOver && vrfWriteReady stageValidReg.foreach { data => when(dequeueFire ^ enqueue.fire) { data := enqueue.fire diff --git a/t1/src/laneStage/MaskExchangeUnit.scala b/t1/src/laneStage/MaskExchangeUnit.scala new file mode 100644 index 000000000..9ac45c363 --- /dev/null +++ b/t1/src/laneStage/MaskExchangeUnit.scala @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl.lane + +import chisel3._ +import chisel3.experimental.hierarchy.{instantiable, public} +import chisel3.util._ +import org.chipsalliance.t1.rtl._ + +@instantiable +class MaskExchangeUnit(parameter: LaneParameter) extends Module { + @public + val enqueue: DecoupledIO[LaneStage3Enqueue] = + IO(Flipped(Decoupled(new LaneStage3Enqueue(parameter, true)))) + + @public + val dequeue: DecoupledIO[LaneStage3Enqueue] = + IO(Decoupled(new LaneStage3Enqueue(parameter, true))) + + @public + val maskReq: DecoupledIO[MaskUnitExeReq] = IO(Decoupled(new MaskUnitExeReq(parameter))) + + @public + val maskRequestToLSU: Bool = IO(Output(Bool())) + + @public + val maskUnitResponse: ValidIO[MaskUnitExeResponse] = IO(Flipped(Valid(new MaskUnitExeResponse(parameter)))) + + // pipe reg + val requestPipeReq: LaneStage3Enqueue = RegInit(0.U.asTypeOf(enqueue.bits)) + val pipeValid: Bool = RegInit(false.B) + // todo: sSendResponse -> sendResponse + val enqIsMaskRequest: Bool = !enqueue.bits.sSendResponse + + // todo: connect mask request & response + maskReq.valid := enqIsMaskRequest && enqueue.valid + maskReq.bits.source1 := enqueue.bits.pipeData + maskReq.bits.source2 := enqueue.bits.data + maskReq.bits.groupCounter := enqueue.bits.groupCounter + maskReq.bits.index := enqueue.bits.instructionIndex + + maskRequestToLSU := enqueue.bits.loadStore + + // type change MaskUnitExeResponse -> LaneStage3Enqueue + val maskUnitResponsePipeType: LaneStage3Enqueue = WireDefault(requestPipeReq) + maskUnitResponsePipeType.groupCounter := maskUnitResponse.bits.writeData.groupCounter + maskUnitResponsePipeType.data := maskUnitResponse.bits.writeData.data + maskUnitResponsePipeType.mask := maskUnitResponse.bits.writeData.mask + maskUnitResponsePipeType.instructionIndex := maskUnitResponse.bits.index + maskUnitResponsePipeType.ffoByOtherLanes := enqueue.bits.ffoByOtherLanes + + val regEnq: Bool = (enqueue.fire && !enqIsMaskRequest) || maskUnitResponse.valid + val pipeRequest: 
Bool = enqueue.fire || maskUnitResponse.valid + when(pipeRequest) { + requestPipeReq := Mux(maskUnitResponse.valid, maskUnitResponsePipeType, enqueue.bits) + } + when(regEnq ^ dequeue.fire) { + pipeValid := regEnq + } + + enqueue.ready := ((!pipeValid || dequeue.ready) && !maskUnitResponse.valid) || enqIsMaskRequest + dequeue.valid := pipeValid + dequeue.bits := requestPipeReq +} diff --git a/t1/src/laneStage/SlotTokenManager.scala b/t1/src/laneStage/SlotTokenManager.scala index b2ed09a4e..d842b9051 100644 --- a/t1/src/laneStage/SlotTokenManager.scala +++ b/t1/src/laneStage/SlotTokenManager.scala @@ -101,6 +101,9 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { @public val dataInWritePipe: UInt = IO(Output(UInt(parameter.chainingSize.W))) + @public + val maskUnitLastReport: UInt = IO(Input(UInt(parameter.chainingSize.W))) + def tokenUpdate(tokenData: Seq[UInt], enqWire: UInt, deqWire: UInt): UInt = { tokenData.zipWithIndex.foreach { case (t, i) => val e = enqWire(i) @@ -114,12 +117,15 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { } // todo: Precise feedback - def feedbackUpdate(tokenData: Seq[UInt], enqWire: UInt, deqWire: UInt): UInt = { + def feedbackUpdate(tokenData: Seq[UInt], enqWire: UInt, deqWire: UInt, clear: UInt): UInt = { tokenData.zipWithIndex.foreach { case (t, i) => val e = enqWire(i) val d = deqWire(i) + val c = clear(i) val change = Mux(e, 1.U(tokenWith.W), -1.S(tokenWith.W).asUInt) - when((e ^ d) && (e || t =/= 0.U)) { + when(c) { + t := 0.U + }.elsewhen((e ^ d) && (e || t =/= 0.U)) { t := t + change } } @@ -132,7 +138,11 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { val enqOH = indexToOH(enqReport.bits.instructionIndex, parameter.chainingSize) val writeDoEnq: UInt = - maskAnd(enqReport.valid && !enqReport.bits.decodeResult(Decoder.sWrite), enqOH).asUInt + maskAnd( + enqReport.valid && !enqReport.bits.decodeResult(Decoder.sWrite) && + !enqReport.bits.decodeResult(Decoder.maskUnit), + enqOH + ).asUInt val writeDoDeq: UInt = maskAnd( @@ -140,7 +150,9 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { indexToOH(slotWriteReport(slotIndex).bits, parameter.chainingSize) ).asUInt - val pendingSlotWrite = tokenUpdate(writeToken, writeDoEnq, writeDoDeq) + val writeEnqSelect: UInt = Wire(UInt(parameter.chainingSize.W)) + + val pendingSlotWrite = tokenUpdate(writeToken, writeEnqSelect, writeDoDeq) if (slotIndex == 0) { val responseToken: Seq[UInt] = Seq.tabulate(parameter.chainingSize)(_ => RegInit(0.U(tokenWith.W))) @@ -181,13 +193,16 @@ class SlotTokenManager(parameter: LaneParameter) extends Module { responseFeedbackReport.bits ) val feedbackDoDeq: UInt = - maskAnd(responseFeedbackReport.valid, indexToOH(feedbackIndexSelect, parameter.chainingSize)).asUInt + maskAnd(responseFeedbackReport.valid, indexToOH(responseFeedbackReport.bits, parameter.chainingSize)).asUInt + + writeEnqSelect := writeDoEnq | feedbackDoDeq val pendingResponse = tokenUpdate(responseToken, responseDoEnq, responseDoDeq) // todo: Precise feedback - val pendingFeedback = feedbackUpdate(feedbackToken, responseDoEnq, feedbackDoDeq) + val pendingFeedback = feedbackUpdate(feedbackToken, responseDoEnq, feedbackDoDeq, maskUnitLastReport) pendingSlotWrite | pendingCrossWriteLSB | pendingCrossWriteMSB | pendingResponse | pendingFeedback } else { + writeEnqSelect := writeDoEnq pendingSlotWrite } }.reduce(_ | _) diff --git a/t1/src/mask/MaskCompress.scala b/t1/src/mask/MaskCompress.scala new file mode 100644 index 000000000..651724380 --- 
/dev/null +++ b/t1/src/mask/MaskCompress.scala @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl + +import chisel3._ +import chisel3.util._ + +class CompressInput(parameter: T1Parameter) extends Bundle { + val maskType: Bool = Bool() + val eew: UInt = UInt(2.W) + val uop: UInt = UInt(3.W) + val readFromScalar: UInt = UInt(parameter.datapathWidth.W) + val source1: UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W) + val source2: UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W) + val groupCounter: UInt = UInt(parameter.laneParam.groupNumberBits.W) + val lastCompress: Bool = Bool() +} + +class CompressOutput(parameter: T1Parameter) extends Bundle { + val data: UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W) + val mask: UInt = UInt((parameter.laneNumber * parameter.datapathWidth / 8).W) + val compressValid: Bool = Bool() +} + +class MaskCompress(parameter: T1Parameter) extends Module { + val in: ValidIO[CompressInput] = IO(Flipped(Valid(new CompressInput(parameter)))) + val out: CompressOutput = IO(Output(new CompressOutput(parameter))) + val newInstruction: Bool = IO(Input(Bool())) + + val maskSize: Int = parameter.laneNumber * parameter.datapathWidth / 8 + + // Source1 alignment + val source1Aligned: UInt = Wire(UInt(maskSize.W)) + // TODO: Align and align in advance + source1Aligned := in.bits.source1 + val compress = in.bits.uop === "b001".U + val viota = in.bits.uop === "b000".U + val mv = in.bits.uop === "b101".U + + val eew1H: UInt = UIntToOH(in.bits.eew)(2, 0) + val compressInit: UInt = RegInit(0.U(log2Ceil(parameter.vLen).W)) + val compressVec: Vec[UInt] = Wire(Vec(maskSize, UInt(compressInit.getWidth.W))) + val compressMaskVec: Seq[Bool] = source1Aligned.asBools + val compressCount: UInt = compressMaskVec.zipWithIndex.foldLeft(compressInit) { case (pre, (mask, index)) => + compressVec(index) := pre + pre + mask + } + // todo: compress update + compressInit := Mux(newInstruction, 0.U, compressCount) + + val viotaResult: UInt = Mux1H( + eew1H, + Seq(1, 2, 4).map { eew => + VecInit(Seq.tabulate(parameter.laneNumber) { index => + // data width: eew * 8, data path 32, need [4 / eew] element + val dataSize = 4 / eew + val res: Seq[UInt] = Seq.tabulate(dataSize) { i => + UIntWithSize(compressVec(dataSize * index + i), eew * 8) + } + // each data path + VecInit(res).asUInt + }).asUInt + } + ) + val viotaMask: UInt = Mux1H( + eew1H, + Seq(1, 2, 4).map { eew => + VecInit(Seq.tabulate(parameter.laneNumber) { index => + val dataSize = 4 / eew + val res: Seq[UInt] = Seq.tabulate(dataSize) { i => + Fill(eew, compressMaskVec(dataSize * index + i)) + } + // 4 bit mask + VecInit(res).asUInt + }).asUInt + } + ) + + val tailCount = compressInit + val compressDataReg = RegInit(0.U((parameter.laneNumber * parameter.datapathWidth).W)) + val compressDataVec = Seq(1, 2, 4).map { eew => + VecInit(Seq.tabulate(parameter.laneNumber * 2) { index => + val useTail = index.U < tailCount + val tailData = cutUInt(compressDataReg, eew)(index) + val maskSize = 4 * parameter.laneNumber / eew + val hitReq = Seq.tabulate(maskSize)(maskIndex => compressVec(maskIndex) === index.U) + val selectReqData = Mux1H( + hitReq, + cutUInt(in.bits.source2, eew) + ) + Mux(useTail, tailData, selectReqData) + }).asUInt + } + val compressResult: UInt = Mux1H(eew1H, compressDataVec) + + // todo: connect & update compressInit + val compressTailMask = Wire(UInt(out.mask.getWidth.W)) + compressTailMask := 
DontCare + + val mvMask = Mux1H(eew1H, Seq(1.U, 3.U, 15.U)) + val mvData = in.bits.readFromScalar + + out.data := Mux1H( + Seq( + compress -> compressResult, + viota -> viotaResult, + mv -> mvData + ) + ) + + // todo: compressMask + out.mask := Mux1H( + Seq( + compress -> compressTailMask, + viota -> viotaMask, + mv -> mvMask + ) + ) + + // todo + out.compressValid := false.B +} diff --git a/t1/src/mask/MaskExtend.scala b/t1/src/mask/MaskExtend.scala new file mode 100644 index 000000000..27a27f9ea --- /dev/null +++ b/t1/src/mask/MaskExtend.scala @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl + +import chisel3._ +import chisel3.util._ + +class ExtendInput(parameter: T1Parameter) extends Bundle { + val eew: UInt = UInt(2.W) + val uop: UInt = UInt(3.W) + val source2: UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W) + val groupCounter: UInt = UInt(parameter.laneParam.groupNumberBits.W) +} + +class MaskExtend(parameter: T1Parameter) extends Module { + val in: ExtendInput = IO(Input(new ExtendInput(parameter))) + val out: UInt = IO(Output(UInt(parameter.datapathWidth.W))) + + val eew1H: UInt = UIntToOH(in.eew)(2, 0) + + val isMaskDestination: Bool = in.uop.andR + val maskDestinationResult: UInt = Mux1H( + eew1H, + Seq(4, 2, 1).map { groupSize => + VecInit( + cutUInt(in.source2, groupSize).grouped(parameter.laneNumber).toSeq.transpose.map(a => VecInit(a).asUInt) + ).asUInt + } + ) + + // extend + val sign: Bool = in.uop(0) + // extend ratio + // todo: Currently only vf2 and vf4 + val extendRatio: Bool = in.uop(1) + + // select source2 + // extendRatio: 0 -> vf2; 1-> vf4 + val source2: UInt = Mux( + extendRatio, + Mux1H( + UIntToOH(in.groupCounter(1, 0)), + cutUInt(in.source2, parameter.laneNumber * parameter.datapathWidth / 4) + ), + Mux1H( + UIntToOH(in.groupCounter(0)), + cutUInt(in.source2, parameter.laneNumber * parameter.datapathWidth / 2) + ) + ) + + val extendResult: UInt = Mux1H( + eew1H(2, 1), + Seq(2, 4).map { dataWidth => + Mux1H( + UIntToOH(extendRatio), + Seq(2, 4).map { ratio => + val resWidth = dataWidth * 8 + val sourceWidth = resWidth / ratio + VecInit(cutUInt(source2, sourceWidth).map { sourceData => + Fill(resWidth - sourceWidth, sourceData(sourceWidth - 1) && sign) ## sourceData + }).asUInt + } + ) + } + ) + + out := Mux(isMaskDestination, maskDestinationResult, extendResult) +} diff --git a/t1/src/mask/MaskReduce.scala b/t1/src/mask/MaskReduce.scala new file mode 100644 index 000000000..afee1bd88 --- /dev/null +++ b/t1/src/mask/MaskReduce.scala @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl + +import chisel3._ +import chisel3.experimental.hierarchy.{Instance, Instantiate} +import chisel3.util._ + +class ReduceInput(parameter: T1Parameter) extends Bundle { + val maskType: Bool = Bool() + val eew: UInt = UInt(2.W) + val uop: UInt = UInt(3.W) + val readVS1: UInt = UInt(parameter.datapathWidth.W) + val source2: UInt = UInt((parameter.laneNumber * parameter.datapathWidth).W) + val sourceValid: UInt = UInt(parameter.laneNumber.W) + val groupCounter: UInt = UInt(parameter.laneParam.groupNumberBits.W) + val lastGroup: Bool = Bool() + val vxrm: UInt = UInt(3.W) + val aluUop: UInt = UInt(4.W) + val sign: Bool = Bool() +} + +class ReduceOutput(parameter: T1Parameter) extends Bundle { + val data: UInt = UInt(parameter.datapathWidth.W) +} + +class MaskReduce(parameter: T1Parameter) extends 
Module { + val in: DecoupledIO[ReduceInput] = IO(Flipped(Decoupled(new ReduceInput(parameter)))) + val out: ValidIO[ReduceOutput] = IO(Valid(new ReduceOutput(parameter))) + val newInstruction: Bool = IO(Input(Bool())) + + val maskSize: Int = parameter.laneNumber * parameter.datapathWidth / 8 + + // todo: uop decode + val order: Bool = in.bits.uop === "b101".U + val reqWiden: Bool = in.bits.uop === "b001".U + + val eew1H: UInt = UIntToOH(in.bits.eew)(2, 0) + val nextFoldCount: Bool = eew1H(0) && !reqWiden + + // reduce function unit + val adder: Instance[ReduceAdder] = Instantiate(new ReduceAdder(parameter.datapathWidth)) + val logicUnit: Instance[LaneLogic] = Instantiate(new LaneLogic(parameter.datapathWidth)) + // option unit for flot reduce + val floatAdder: Option[Instance[FloatAdder]] = + Option.when(parameter.fpuEnable)(Instantiate(new FloatAdder(8, 24))) + val flotCompare: Option[Instance[FloatCompare]] = + Option.when(parameter.fpuEnable)(Instantiate(new FloatCompare(8, 24))) + + // init reg + val reduceInit: UInt = RegInit(0.U(parameter.datapathWidth.W)) + val reduceResult: UInt = Wire(UInt(parameter.datapathWidth.W)) + val crossFoldCount: UInt = RegInit(0.U(log2Ceil(parameter.laneNumber).W)) + val lastFoldCount: Bool = RegInit(false.B) + + val reqReg = RegEnable(in.bits, 0.U.asTypeOf(in.bits), in.fire) + // todo: handle reqReg.sourceValid + val groupLastReduce = crossFoldCount.andR + val lastFoldEnd = !lastFoldCount + val outValid: Bool = WireDefault(false.B) + // todo: skip float reduce + val skipFlotReduce: Bool = WireDefault(false.B) + + val eew1HReg: UInt = UIntToOH(reqReg.eew)(2, 0) + val floatType: Bool = reqReg.uop(2) + val NotAdd: Bool = reqReg.uop(1) + val widen: Bool = reqReg.uop === "b001".U + val needFold: Bool = eew1HReg(0) || (eew1HReg(1) && !widen) + + // crossFold: reduce between lane + // lastFold: reduce in data path + // orderRed: order reduce + val idle :: crossFold :: lastFold :: orderRed :: Nil = Enum(4) + val state: UInt = RegInit(idle) + + val stateIdle: Bool = state === idle + val stateCross: Bool = state === crossFold + val stateLast: Bool = state === lastFold + val stateOrder: Bool = state === orderRed + + // state update + in.ready := stateIdle + when(stateIdle) { + when(in.valid) { + state := Mux(order, orderRed, crossFold) + } + } + + when(stateCross) { + when(groupLastReduce) { + state := Mux(reqReg.lastGroup && needFold, lastFold, idle) + outValid := reqReg.lastGroup && !needFold + } + } + + when(stateOrder) { + when(groupLastReduce) { + state := idle + outValid := reqReg.lastGroup + } + } + + when(stateLast) { + when(lastFoldEnd) { + state := idle + outValid := true.B + } + } + + when(newInstruction) { + // todo: update reduceInit when first in.fire + reduceInit := in.bits.readVS1 + crossFoldCount := 0.U + lastFoldCount := nextFoldCount + } + + // count update + // todo: stateCross <=> stateOrder ?? 
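+  // Worked example (editorial comment, assuming laneNumber = 8, a 32-bit datapath and a
+  // non-ordered integer reduce with sew = 8): crossFold walks crossFoldCount = 0..7, each
+  // cycle folding one lane word of source2 into reduceInit; on the last group needFold is
+  // set, so lastFold then runs two more cycles (one cycle for sew = 16) to fold the 32-bit
+  // accumulator down to a single sew-wide element before out.valid is raised.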
+ when(stateCross || stateOrder || in.fire) { + crossFoldCount := Mux(in.fire, 0.U, crossFoldCount + 1.U) + } + + // result update + when(!stateIdle) { + reduceInit := reduceResult + } + + when(stateLast) { + lastFoldCount := false.B + } + + val selectLaneResult: UInt = Mux1H( + UIntToOH(crossFoldCount), + cutUInt(reqReg.source2, parameter.datapathWidth) + ) + val reduceDataVec = cutUInt(reduceInit, 8) + // reduceFoldCount = false => abcd -> xxab | xxcd -> mask 0011 + // reduceFoldCount = true => abcd -> xaxc | xbxd -> mask 0101 + val lastFoldSource1: UInt = Mux( + lastFoldCount, + reduceDataVec(3) ## reduceDataVec(3) ## reduceDataVec(1), + reduceDataVec(3) ## reduceDataVec(3) ## reduceDataVec(2) + ) + val source2Select: UInt = Mux(stateCross || stateOrder, selectLaneResult, lastFoldSource1) + + // popCount 在top视为reduce add + adder.request.src := VecInit(Seq(reduceInit, source2Select)) + // todo: pop + adder.request.opcode := reqReg.aluUop(2) + adder.request.sign := reqReg.sign + adder.request.vSew := reqReg.eew + + floatAdder.foreach { fAdder => + fAdder.io.a := reduceInit + fAdder.io.b := source2Select + fAdder.io.roundingMode := reqReg.vxrm + } + + flotCompare.foreach { fCompare => + fCompare.io.a := reduceInit + fCompare.io.b := source2Select + // max -> 12, min -> 8 + fCompare.io.isMax := reqReg.aluUop(2) + } + + logicUnit.req.src := VecInit(Seq(reduceInit, source2Select)) + logicUnit.req.opcode := reqReg.aluUop + + val flotReduceResult: Option[UInt] = Option.when(parameter.fpuEnable)( + Mux( + skipFlotReduce, + reduceInit, + Mux(NotAdd, flotCompare.get.io.out, floatAdder.get.io.out) + ) + ) + // select result + reduceResult := Mux( + floatType, + flotReduceResult.getOrElse(adder.response.data), + Mux(NotAdd, logicUnit.resp, adder.response.data) + ) + + out.valid := outValid + out.bits.data := reduceResult +} diff --git a/t1/src/mask/MaskUnit.scala b/t1/src/mask/MaskUnit.scala new file mode 100644 index 000000000..5985a83fd --- /dev/null +++ b/t1/src/mask/MaskUnit.scala @@ -0,0 +1,580 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl + +import chisel3._ +import chisel3.experimental.hierarchy.{instantiable, public} +import chisel3.util._ +import org.chipsalliance.t1.rtl.decoder.Decoder + +// top uop decode +// uu ii x -> uu: unit index; ii: Internal encoding, x: additional encode + +// slid & gather unit, need read vrf in mask unit(00) +// 00 00 x -> slid; x? up: down +// 00 01 x -> slid1; x? up: down +// 00 10 x -> gather; x? 16 : sew todo:(multi address check/ index -> data cache?) + +// compress & viota unit & vmv(01) +// These instructions cannot extend their execution width indefinitely. +// 01 00 x -> x ? compress : viota +// 01 01 x -> vmv; x: write rd ? + +// reduce unit(10) n + 8 + m -> n + 3 + m // Folded into datapath, then folded into sew +// The Reduce instruction folds the data. +// Considering the sequential addition, a state machine is needed to control it. +// 10 00 x -> adder; x: widen reduce? +// 10 01 x -> logic; x: dc +// 10 10 x -> floatAdder; x: order? +// 10 11 x -> flotCompare; x: dc + +// extend unit & maskdestination(11) +// These instructions write an entire data path each time they are executed. +// 11 mm x -> s(z)ext; mm: multiple(00, 01, 10); x ? 
sign : zero +// 11 11 1 -> maskdestination +@instantiable +class MaskUnit(parameter: T1Parameter) extends Module { + // todo: param + val readQueueSize: Int = 4 + val readVRFLatency: Int = 2 + val maskUnitWriteQueueSize: Int = 8 + + @public + val instReq: ValidIO[MaskUnitInstReq] = IO(Flipped(Valid(new MaskUnitInstReq(parameter)))) + + @public + val exeReq: Seq[DecoupledIO[MaskUnitExeReq]] = Seq.tabulate(parameter.laneNumber) { _ => + IO(Flipped(Decoupled(new MaskUnitExeReq(parameter.laneParam)))) + } + + @public + val exeResp: Seq[ValidIO[MaskUnitExeResponse]] = Seq.tabulate(parameter.laneNumber) { _ => + IO(Valid(new MaskUnitExeResponse(parameter.laneParam))) + } + + @public + val readChannel: Seq[DecoupledIO[VRFReadRequest]] = Seq.tabulate(parameter.laneNumber) { _ => + IO( + Decoupled( + new VRFReadRequest( + parameter.vrfParam.regNumBits, + parameter.laneParam.vrfOffsetBits, + parameter.instructionIndexBits + ) + ) + ) + } + + @public + val readResult: Seq[UInt] = Seq.tabulate(parameter.laneNumber) { _ => + IO(Input(UInt(parameter.datapathWidth.W))) + } + + @public + val writeRD: ValidIO[UInt] = IO(Valid(UInt(parameter.datapathWidth.W))) + + /** input of mask data */ + @public + val maskInput: UInt = IO(Input(UInt(parameter.maskGroupWidth.W))) + + /** select which mask group. */ + @public + val maskSelect: UInt = IO(Output(UInt(parameter.laneParam.maskGroupSizeBits.W))) + + @public + val lastReport: UInt = IO(Output(UInt(parameter.chainingSize.W))) + + val instReg: MaskUnitInstReq = RegEnable(instReq.bits, 0.U.asTypeOf(instReq.bits), instReq.valid) + val sew1H: UInt = UIntToOH(instReg.sew)(2, 0) + val lastExecuteIndex: UInt = Mux1H(sew1H, Seq(3.U(2.W), 2.U(2.W), 0.U(2.W))) + + // calculate last group + val readDataEew1H: UInt = sew1H + val lastElementIndex: UInt = (instReg.vl - instReg.vl.orR)(parameter.laneParam.vlMaxBits - 2, 0) + val laneNumberBits: Int = 1.max(log2Ceil(parameter.laneNumber)) + + /** For an instruction, the last group is not executed by all lanes, here is the last group of the instruction xxxxx + * xxx xx -> vsew = 0 xxxxxx xxx x -> vsew = 1 xxxxxxx xxx -> vsew = 2 + */ + val lastGroupForInstruction: UInt = Mux1H( + readDataEew1H, + Seq( + lastElementIndex(parameter.laneParam.vlMaxBits - 2, laneNumberBits + 2), + lastElementIndex(parameter.laneParam.vlMaxBits - 2, laneNumberBits + 1), + lastElementIndex(parameter.laneParam.vlMaxBits - 2, laneNumberBits) + ) + ) + + /** Which lane the last element is in. */ + val lastLaneIndex: UInt = Mux1H( + readDataEew1H, + Seq( + lastElementIndex(laneNumberBits + 2 - 1, 2), + lastElementIndex(laneNumberBits + 1 - 1, 1), + lastElementIndex(laneNumberBits - 1, 0) + ) + ) + val lastGroupDataNeed: UInt = scanRightOr(UIntToOH(lastLaneIndex)) + + // from decode + val unitType: UInt = UIntToOH(instReg.decodeResult(Decoder.topUop)(4, 3)) + val readType: Bool = unitType(0) + val gather16: Bool = instReg.decodeResult(Decoder.topUop) === "b00101".U + + val sewCorrection: UInt = Mux(gather16, 1.U, instReg.sew) + + val exeRequestQueue: Seq[Queue[MaskUnitExeReq]] = exeReq.map { req => + // todo: max or token? 
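+    // Editorial note on the depth: 16 below is a placeholder (see the todo above); a
+    // token/credit scheme sized by how many requests a lane may have in flight would let
+    // this shrink, e.g. something like
+    //   Module(new Queue(chiselTypeOf(req.bits), parameter.chainingSize, flow = true))
+    // -- that depth expression is an assumption, not part of this patch.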
+ val queue: Queue[MaskUnitExeReq] = Module(new Queue(chiselTypeOf(req.bits), 16, flow = true)) + queue.io.enq.valid := req.valid + req.ready := queue.io.enq.ready + queue.io.enq.bits := req.bits + queue + } + + val exeReqReg: Seq[ValidIO[MaskUnitExeReq]] = Seq.tabulate(parameter.laneNumber) { _ => + RegInit(0.U.asTypeOf(Valid(new MaskUnitExeReq(parameter.laneParam)))) + } + val lastGroup: Bool = exeReqReg.head.bits.groupCounter === lastGroupForInstruction + // todo: mask + val groupDataNeed: UInt = Mux(lastGroup, lastGroupDataNeed, (-1.S(parameter.laneNumber.W)).asUInt) + // For read type, only sew * laneNumber data will be consumed each time + // There will be a maximum of (dataPath * laneNumber) / (sew * laneNumber) times + val executeIndex: UInt = RegInit(0.U(2.W)) + // The status of an execution + // Each execution ends with executeIndex + 1 + val readGroupState: MaskUnitExecuteState = RegInit(0.U.asTypeOf(new MaskUnitExecuteState(parameter))) + val executeStateValid: Bool = RegInit(false.B) + + def indexAnalysis(sewInt: Int)(elementIndex: UInt, vlmul: UInt, valid: Option[Bool] = None): Seq[UInt] = { + val intLMULInput: UInt = (1.U << vlmul(1, 0)).asUInt + val positionSize = parameter.laneParam.vlMaxBits - 1 + val dataPosition = (changeUIntSize(elementIndex, positionSize) << sewInt).asUInt(positionSize - 1, 0) + val accessMask: UInt = Seq( + UIntToOH(dataPosition(1, 0)), + FillInterleaved(2, UIntToOH(dataPosition(1))), + 15.U(4.W) + )(sewInt) + // The offset of the data starting position in 32 bits (currently only 32). + // Since the data may cross lanes, it will be optimized during fusion. + // (dataPosition(1) && sewOHInput(1, 0).orR) ## (dataPosition(0) && sewOHInput(0)) + val dataOffset: UInt = + (if (sewInt < 2) dataPosition(1) else false.B) ## + (if (sewInt == 0) dataPosition(0) else false.B) + val accessLane = if (parameter.laneNumber > 1) dataPosition(log2Ceil(parameter.laneNumber) + 1, 2) else 0.U(1.W) + // 32 bit / group + val dataGroup = (dataPosition >> (log2Ceil(parameter.laneNumber) + 2)).asUInt + val offsetWidth: Int = parameter.laneParam.vrfParam.vrfOffsetBits + val offset = dataGroup(offsetWidth - 1, 0) + val accessRegGrowth = (dataGroup >> offsetWidth).asUInt + val decimalProportion = offset ## accessLane + // 1/8 register + val decimal = decimalProportion(decimalProportion.getWidth - 1, 0.max(decimalProportion.getWidth - 3)) + + /** elementIndex needs to be compared with vlMax(vLen * lmul /sew) This calculation is too complicated We can change + * the angle. Calculate the increment of the read register and compare it with lmul to know whether the index + * exceeds vlMax. 
vlmul needs to distinguish between integers and floating points + */ + val overlap = + (vlmul(2) && decimal >= intLMULInput(3, 1)) || + (!vlmul(2) && accessRegGrowth >= intLMULInput) || + (elementIndex >> log2Ceil(parameter.vLen)).asUInt.orR + val elementValid = valid.getOrElse(true.B) + val notNeedRead = overlap || !elementValid + val reallyGrowth: UInt = changeUIntSize(accessRegGrowth, 3) + Seq(accessMask, dataOffset, accessLane, offset, reallyGrowth, notNeedRead, elementValid) + } + + // datapath bit per mask group + // laneNumber bit per execute group + val executeGroup: UInt = Mux1H( + sew1H, + Seq( + exeReqReg.head.bits.groupCounter ## executeIndex, + exeReqReg.head.bits.groupCounter ## executeIndex(1), + exeReqReg.head.bits.groupCounter + ) + ) + + val executeSizeBit: Int = log2Ceil(parameter.laneNumber) + val vlMisAlign = instReg.vl(executeSizeBit - 1, 0).orR + val lastexecuteGroup: UInt = (instReg.vl >> executeSizeBit).asUInt - !vlMisAlign + val isVlBoundary: Bool = executeGroup === lastexecuteGroup + val validExecuteGroup: Bool = executeGroup <= lastexecuteGroup + val vlBoundaryCorrection: UInt = Mux( + vlMisAlign && isVlBoundary, + (~scanLeftOr(UIntToOH(instReg.vl(executeSizeBit - 1, 0)))).asUInt, + -1.S(parameter.laneNumber.W).asUInt + ) & Fill(parameter.laneNumber, validExecuteGroup) + + maskSelect := executeGroup >> log2Ceil(parameter.datapathWidth / parameter.laneNumber) + val selectReadStageMask: UInt = cutUIntBySize(maskInput, 4)(executeGroup(1, 0)) + val maskCorrection: UInt = + Mux(instReg.maskType, selectReadStageMask, -1.S(parameter.laneNumber.W).asUInt) & + vlBoundaryCorrection + + val checkVec: Seq[Seq[UInt]] = Seq(0, 1, 2).map { sewInt => + val dataByte = 1 << sewInt + // All data of this group + val groupSourceData: UInt = VecInit(exeReqReg.map(_.bits.source1)).asUInt + val groupSourceValid: UInt = VecInit(exeReqReg.map(_.valid)).asUInt + // Single use length + val singleWidth = dataByte * 8 * parameter.laneNumber + // How many times will a set of data be executed? + val executeTimes = (parameter.datapathWidth / 8) / dataByte + // Which part is selected as the source data this time? 
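+      // Editorial example, assuming a 32-bit datapath: for sewInt = 0 a lane word holds four
+      // 8-bit elements, so executeIndex(1, 0) selects one of four slices; for sewInt = 1 it
+      // holds two 16-bit elements and executeIndex(1) selects one of two; for sewInt = 2 the
+      // whole word is consumed at once and the select degenerates to true.B.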
+ val executeDataSelect1H: UInt = if (sewInt == 0) { + UIntToOH(executeIndex) + } else if (sewInt == 1) { + UIntToOH(executeIndex(1)) + } else { + true.B + } + // Select source data + val sourceSelect = Mux1H( + executeDataSelect1H, + cutUInt(groupSourceData, singleWidth) + ) + val validSelect: UInt = Mux1H( + executeDataSelect1H, + cutUInt(groupSourceValid, singleWidth / parameter.datapathWidth) + ) + + // The length of an element + val dataWidth = 8 * dataByte + // Split into elements + val source = cutUInt(sourceSelect, dataWidth) + val validVec = FillInterleaved(parameter.datapathWidth / dataWidth, validSelect) & maskCorrection + // read index check + // (accessMask, dataOffset, accessLane, offset, reallyGrowth, overlap) + val checkResultVec: Seq[Seq[UInt]] = source.zipWithIndex.map { case (s, i) => + indexAnalysis(sewInt)(s, instReg.vlmul, Some(validVec(i))) + } + val checkResult = checkResultVec.transpose.map(a => VecInit(a).asUInt) + checkResult + } + val sewCorrection1H: UInt = UIntToOH(sewCorrection)(2, 0) + val dataOffsetSelect: UInt = Mux1H(sewCorrection1H, checkVec.map(_(1))) + val accessLaneSelect: UInt = Mux1H(sewCorrection1H, checkVec.map(_(2))) + val offsetSelect: UInt = Mux1H(sewCorrection1H, checkVec.map(_(3))) + val growthSelect: UInt = Mux1H(sewCorrection1H, checkVec.map(_(4))) + val notReadSelect: UInt = Mux1H(sewCorrection1H, checkVec.map(_(5))) + val elementValidSelect: UInt = Mux1H(sewCorrection1H, checkVec.map(_(6))) + + val readCrossBar: MaskUnitReadCrossBar = Module(new MaskUnitReadCrossBar(parameter)) + + // The queue waiting to read data. This queue contains other information about this group. + // 64: todo: max or token? + val readWaitQueue: Queue[MaskUnitWaitReadQueue] = + Module(new Queue(new MaskUnitWaitReadQueue(parameter), 64)) + + // s0 pipe request from lane + val laseExecuteGroupDeq: Bool = Wire(Bool()) + exeRequestQueue.zip(exeReqReg).foreach { case (req, reg) => + req.io.deq.ready := !reg.valid || laseExecuteGroupDeq + when(req.io.deq.fire) { + reg.bits := req.io.deq.bits + } + when(req.io.deq.fire ^ laseExecuteGroupDeq) { + reg.valid := req.io.deq.fire + } + } + + val isLastExecuteGroup: Bool = executeIndex === lastExecuteIndex + val allDataValid: Bool = exeReqReg.zipWithIndex.map { case (d, i) => d.valid || !groupDataNeed(i) }.reduce(_ && _) + val canIssueGroup: Bool = allDataValid && readWaitQueue.io.enq.ready + + // select execute group + val selectExecuteReq: Seq[ValidIO[MaskUnitReadReq]] = exeReqReg.zipWithIndex.map { case (_, index) => + val res: ValidIO[MaskUnitReadReq] = WireInit(0.U.asTypeOf(Valid(new MaskUnitReadReq(parameter)))) + res.bits.vs := instReg.vs2 + readGroupState.vsGrowth(index) + res.bits.offset := readGroupState.readOffset(index) + res.bits.readLane := readGroupState.accessLane(index) + res.bits.dataOffset := cutUIntBySize(readGroupState.readDataOffset, parameter.laneNumber)(index) + res.bits.requestIndex := index.U + res.valid := executeStateValid && !readGroupState.groupReadState(index) && readGroupState.needRead(index) + res + } + + // read arbitration + readCrossBar.input.zip(selectExecuteReq).foreach { case (cross, req) => + cross.valid := req.valid + cross.bits := req.bits + } + + // read control register update + val readFire: UInt = VecInit(readCrossBar.input.map(_.fire)).asUInt + val anyReadFire: Bool = readFire.orR + val readStateUpdate: UInt = readFire | readGroupState.groupReadState + val groupReadFinish: Bool = readStateUpdate === readGroupState.needRead + val readStateDeq: Bool = (anyReadFire && groupReadFinish) 
|| (executeStateValid && readGroupState.needRead === 0.U) + val executeStateEnq: Bool = allDataValid && (readStateDeq || !executeStateValid) + when(anyReadFire) { + readGroupState.groupReadState := readStateUpdate + } + + when(readStateDeq ^ executeStateEnq) { + executeStateValid := executeStateEnq + } + + val executeIndexGrowth: UInt = (1.U << instReg.sew).asUInt + when(executeStateEnq) { + readGroupState.groupReadState := 0.U + readGroupState.needRead := (~notReadSelect).asUInt + readGroupState.elementValid := elementValidSelect + readGroupState.accessLane := cutUIntBySize(accessLaneSelect, parameter.laneNumber) + readGroupState.vsGrowth := cutUIntBySize(growthSelect, parameter.laneNumber) + readGroupState.readOffset := offsetSelect + readGroupState.groupCount := exeReqReg.head.bits.groupCounter + readGroupState.executeIndex := executeIndex + readGroupState.readDataOffset := dataOffsetSelect + readGroupState.last := isVlBoundary + executeIndex := executeIndex + executeIndexGrowth + } + + readWaitQueue.io.enq.valid := readStateDeq + readWaitQueue.io.enq.bits.groupCounter := readGroupState.groupCount + readWaitQueue.io.enq.bits.executeIndex := readGroupState.executeIndex + readWaitQueue.io.enq.bits.sourceValid := readGroupState.elementValid + readWaitQueue.io.enq.bits.needRead := readGroupState.needRead + readWaitQueue.io.enq.bits.last := readGroupState.last + + laseExecuteGroupDeq := Mux(readType, executeStateEnq, readWaitQueue.io.enq.fire) && isLastExecuteGroup + + // s1 read vrf + val write1HPipe: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.laneNumber.W))) + val pipeDataOffset: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(log2Ceil(parameter.datapathWidth / 8).W))) + + readCrossBar.output.zipWithIndex.foreach { case (request, index) => + val sourceLane = UIntToOH(request.bits.writeIndex) + readChannel(index).valid := request.valid + readChannel(index).bits.readSource := 2.U + readChannel(index).bits.vs := request.bits.vs + readChannel(index).bits.offset := request.bits.offset + readChannel(index).bits.instructionIndex := instReg.instructionIndex + request.ready := readChannel(index).ready + + // pipe read fire + val pipeRead = Pipe(readChannel(index).fire, sourceLane, readVRFLatency) + val pipeOffset = Pipe(readChannel(index).fire, request.bits.dataOffset, readVRFLatency) + write1HPipe(index) := Mux(pipeRead.valid, pipeRead.bits, 0.U(parameter.laneNumber.W)) + pipeDataOffset(index) := pipeOffset.bits + } + + // Processing read results + val readData: Seq[DecoupledIO[UInt]] = Seq.tabulate(parameter.laneNumber) { index => + // todo: assert enq.read & use token + val readDataQueue = Module(new Queue(UInt(parameter.datapathWidth.W), 4, flow = true)) + val readResultSelect = VecInit(write1HPipe.map(_(index))).asUInt + val dataOffset: UInt = Mux1H(readResultSelect, pipeDataOffset) + readDataQueue.io.enq.valid := readResultSelect.orR + readDataQueue.io.enq.bits := Mux1H(readResultSelect, readResult) >> (dataOffset ## 0.U(3.W)) + readDataQueue.io.deq + } + + /** todo: [[waiteReadDataPipeReg]] enq && [[readWaitQueue]] enq * */ + // reg before execute + val waiteReadDataPipeReg: MaskUnitWaitReadQueue = RegInit(0.U.asTypeOf(new MaskUnitWaitReadQueue(parameter))) + val waiteReadData: Seq[UInt] = Seq.tabulate(parameter.laneNumber) { _ => RegInit(0.U(parameter.datapathWidth.W)) } + val waiteReadSate: UInt = RegInit(0.U(parameter.laneNumber.W)) + val waiteReadStageValid: Bool = RegInit(false.B) + + // Process the data that needs to be written + val dlen: Int = parameter.datapathWidth * 
parameter.laneNumber + // Execute at most 4 times, each index represents 1/4 of dlen + val eachIndexSize = dlen / 4 + val writeDataVec = Seq(0, 1, 2).map { sewInt => + val dataByte = 1 << sewInt + val data = VecInit(Seq.tabulate(parameter.laneNumber) { laneIndex => + val dataElement: UInt = Wire(UInt((dataByte * 8).W)) + val dataIsRead = waiteReadDataPipeReg.needRead(laneIndex) + // todo: select vs1 when slide1 + dataElement := Mux(dataIsRead, waiteReadData(laneIndex), 0.U) + dataElement + }).asUInt + + val shifterData = (data << (waiteReadDataPipeReg.executeIndex ## 0.U(log2Ceil(eachIndexSize).W))).asUInt + // align + changeUIntSize(shifterData, dlen) + } + val writeData = Mux1H(sew1H, writeDataVec) + + val writeMaskVec: Seq[UInt] = Seq(0, 1, 2).map { sewInt => + val MaskMagnification = 1 << sewInt + val mask = FillInterleaved(MaskMagnification, waiteReadDataPipeReg.sourceValid) + val shifterMask = (mask << (waiteReadDataPipeReg.executeIndex ## 0.U(log2Ceil(eachIndexSize / 8).W))).asUInt + // align + changeUIntSize(shifterMask, dlen / 8) + } + val writeMask = Mux1H(sew1H, writeMaskVec) + + val writeRequest: Seq[MaskUnitExeResponse] = Seq.tabulate(parameter.laneNumber) { laneIndex => + val res: MaskUnitExeResponse = Wire(new MaskUnitExeResponse(parameter.laneParam)) + res.ffoByOther := false.B + res.index := instReg.instructionIndex + res.writeData.groupCounter := waiteReadDataPipeReg.groupCounter + res.writeData.data := cutUIntBySize(writeData, parameter.laneNumber)(laneIndex) + res.writeData.mask := cutUIntBySize(writeMask, parameter.laneNumber)(laneIndex) + res + } + val WillWriteLane: UInt = VecInit(cutUIntBySize(writeMask, parameter.laneNumber).map(_.orR)).asUInt + + // update waite read stage + val waiteStageDeqValid: Bool = + waiteReadStageValid && + (waiteReadSate === waiteReadDataPipeReg.needRead || waiteReadDataPipeReg.needRead === 0.U) + val waiteStageDeqReady: Bool = Wire(Bool()) + val waiteStageDeqFire: Bool = waiteStageDeqValid && waiteStageDeqReady + + val waiteStageEnqReady: Bool = !waiteReadStageValid || waiteStageDeqFire + val waiteStageEnqFire: Bool = readWaitQueue.io.deq.valid && waiteStageEnqReady + + readWaitQueue.io.deq.ready := waiteStageEnqReady + + when(waiteStageEnqFire) { + waiteReadDataPipeReg := readWaitQueue.io.deq.bits + } + + when(waiteStageDeqFire ^ waiteStageEnqFire) { + waiteReadStageValid := waiteStageEnqFire + } + + waiteReadData.zipWithIndex.foreach { case (reg, index) => + val isWaiteForThisData = waiteReadDataPipeReg.needRead(index) && !waiteReadSate(index) && waiteReadStageValid + val read = readData(index) + read.ready := isWaiteForThisData + when(read.fire) { + reg := read.bits + } + } + val readResultValid: UInt = VecInit(readData.map(_.fire)).asUInt + when(waiteStageEnqFire && readResultValid.orR) { + waiteReadSate := readResultValid + }.elsewhen(readResultValid.orR) { + waiteReadSate := waiteReadSate | readResultValid + }.elsewhen(waiteStageEnqFire) { + waiteReadSate := 0.U + } + + // Determine whether the data is ready + val executeEnqValid: Bool = waiteReadStageValid && waiteReadDataPipeReg.needRead === waiteReadSate + + // start execute + val compressUnit: MaskCompress = Module(new MaskCompress(parameter)) + val reduceUnit: MaskReduce = Module(new MaskReduce(parameter)) + val extendUnit: MaskExtend = Module(new MaskExtend(parameter)) + + // todo + val source2: UInt = VecInit(exeReqReg.map(_.bits.source2)).asUInt + val source1: UInt = VecInit(exeReqReg.map(_.bits.source1)).asUInt + + compressUnit.in.valid := executeEnqValid + 
compressUnit.in.bits.maskType := instReg.maskType + compressUnit.in.bits.eew := instReg.sew + compressUnit.in.bits.uop := instReg.decodeResult(Decoder.topUop) + compressUnit.in.bits.readFromScalar := instReg.readFromScala + compressUnit.in.bits.source1 := source1 + compressUnit.in.bits.source2 := source2 + compressUnit.in.bits.groupCounter := waiteReadDataPipeReg.groupCounter + compressUnit.in.bits.lastCompress := lastGroup + compressUnit.newInstruction := instReq.valid + + reduceUnit.in.valid := executeEnqValid && unitType(2) + reduceUnit.in.bits.maskType := instReg.maskType + reduceUnit.in.bits.eew := instReg.sew + reduceUnit.in.bits.uop := instReg.decodeResult(Decoder.topUop) + reduceUnit.in.bits.readVS1 := source1 + reduceUnit.in.bits.source2 := source2 + reduceUnit.in.bits.sourceValid := waiteReadDataPipeReg.sourceValid + reduceUnit.in.bits.groupCounter := waiteReadDataPipeReg.groupCounter + reduceUnit.in.bits.lastGroup := lastGroup + reduceUnit.in.bits.vxrm := instReg.vxrm + reduceUnit.in.bits.aluUop := instReg.decodeResult(Decoder.uop) + reduceUnit.in.bits.sign := !instReg.decodeResult(Decoder.unsigned1) + reduceUnit.newInstruction := instReq.valid + + extendUnit.in.eew := instReg.sew + extendUnit.in.uop := instReg.decodeResult(Decoder.topUop) + extendUnit.in.source2 := source2 + extendUnit.in.groupCounter := waiteReadDataPipeReg.groupCounter + + val executeResult = Mux1H( + unitType, + Seq( + source2, + compressUnit.out.data, + reduceUnit.out.bits.data, + extendUnit.out + ) + ) + + // todo + val executeMask: UInt = VecInit(exeReqReg.map(_.bits.source2)).asUInt + val executeDeqCount: UInt = waiteReadDataPipeReg.groupCounter + + // val executeValid = Mux1H( + // unitType, + // Seq( + // executeEnqValid, + // compressUnit.out.compressValid, + // reduceUnit.out.valid, + // executeEnqValid + // ) + // ) + val executeValid: Bool = Mux1H( + unitType(3, 1), + Seq( + compressUnit.out.compressValid, + reduceUnit.out.valid, + executeEnqValid + ) + ) + + // mask unit write queue + val writeQueue: Seq[Queue[MaskUnitExeResponse]] = Seq.tabulate(parameter.laneNumber) { _ => + Module( + new Queue( + new MaskUnitExeResponse(parameter.laneParam), + maskUnitWriteQueueSize + ) + ) + } + + writeQueue.zipWithIndex.foreach { case (queue, index) => + val readTypeWriteVrf: Bool = waiteStageDeqFire && WillWriteLane(index) + queue.io.enq.valid := executeValid || readTypeWriteVrf + queue.io.enq.bits.writeData.data := cutUInt(executeResult, parameter.datapathWidth)(index) + queue.io.enq.bits.writeData.mask := cutUInt(executeMask, parameter.datapathWidth / 8)(index) + queue.io.enq.bits.writeData.groupCounter := executeDeqCount + queue.io.enq.bits.ffoByOther := false.B // todo + queue.io.enq.bits.index := instReg.instructionIndex + when(readTypeWriteVrf) { + queue.io.enq.bits := writeRequest(index) + } + + // write vrf + val writePort = exeResp(index) + queue.io.deq.ready := true.B + writePort.valid := queue.io.deq.valid + writePort.bits := queue.io.deq.bits + } + waiteStageDeqReady := writeQueue.zipWithIndex.map { case (queue, index) => + !WillWriteLane(index) || queue.io.enq.ready + }.reduce(_ && _) + writeRD <> DontCare + + // todo: token + val waitQueueClear: Bool = RegInit(false.B) + val lastReportValid = waitQueueClear && !writeQueue.map(_.io.deq.valid).reduce(_ || _) + when(lastReportValid) { + waitQueueClear := false.B + } + when(waiteStageDeqFire && waiteReadDataPipeReg.last) { + waitQueueClear := true.B + } + lastReport := maskAnd( + lastReportValid, + indexToOH(instReg.instructionIndex, 
parameter.chainingSize) + ) +} diff --git a/t1/src/mask/MaskUnitReadCrossBar.scala b/t1/src/mask/MaskUnitReadCrossBar.scala new file mode 100644 index 000000000..dab845d9e --- /dev/null +++ b/t1/src/mask/MaskUnitReadCrossBar.scala @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 2022 Jiuyang Liu + +package org.chipsalliance.t1.rtl + +import chisel3._ +import chisel3.util._ + +class MaskUnitReadCrossBar(parameter: T1Parameter) extends Module { + val input: Seq[DecoupledIO[MaskUnitReadReq]] = Seq.tabulate(parameter.laneNumber)(_ => + IO( + Flipped( + Decoupled( + new MaskUnitReadReq(parameter) + ) + ) + ) + ) + val output: Seq[DecoupledIO[MaskUnitReadQueue]] = Seq.tabulate(parameter.laneNumber)(_ => + IO( + Decoupled( + new MaskUnitReadQueue(parameter) + ) + ) + ) + + val inputSelect1H: Vec[UInt] = Wire(Vec(parameter.laneNumber, UInt(parameter.laneNumber.W))) + + input.zipWithIndex.foldLeft(0.U(parameter.laneNumber.W)) { case (laneOccupied, (req, index)) => + val requestReadLane = UIntToOH(req.bits.readLane) + // read lane free + val free: Bool = (requestReadLane & (~laneOccupied).asUInt).orR + val outReady: Bool = Mux1H(requestReadLane, output.map(_.ready)) + req.ready := free && outReady + inputSelect1H(index) := Mux(req.valid && free, requestReadLane, 0.U(parameter.laneNumber.W)) + laneOccupied | inputSelect1H(index) + } + + output.zipWithIndex.foreach { case (req, index) => + val tryToRead: UInt = VecInit(inputSelect1H.map(_(index))).asUInt + req.valid := tryToRead.orR + val selectReq: DecoupledIO[MaskUnitReadReq] = Mux1H(tryToRead, input) + req.bits.vs := selectReq.bits.vs + req.bits.offset := selectReq.bits.offset + req.bits.writeIndex := selectReq.bits.requestIndex + req.bits.dataOffset := selectReq.bits.dataOffset + } +} diff --git a/t1/src/package.scala b/t1/src/package.scala index b0afc12be..ba1519b6b 100644 --- a/t1/src/package.scala +++ b/t1/src/package.scala @@ -10,6 +10,8 @@ import chisel3.util.experimental.decode.DecodeBundle import org.chipsalliance.t1.rtl.decoder.{Decoder, TableGenerator} import org.chipsalliance.t1.rtl.lane.Distributor +import scala.jdk.CollectionConverters._ + package object rtl { def csa32(s: UInt, c: UInt, a: UInt): (UInt, UInt) = { val xor = s ^ c @@ -74,6 +76,30 @@ package object rtl { }) } + def cutUIntBySize(data: UInt, size: Int): Vec[UInt] = { + require(data.getWidth % size == 0) + val width: Int = data.getWidth / size + cutUInt(data, width) + } + + def changeUIntSize(data: UInt, size: Int, sign: Boolean = false): UInt = { + if (data.getWidth >= size) { + data(size - 1, 0) + } else { + val extend = if (sign) data(data.getWidth - 1) else false.B + Fill(size - data.getWidth, extend) ## data + } + } + + def UIntWithSize(data: UInt, width: Int, signExtend: Boolean = false): UInt = { + val sign = if (signExtend) data(data.getWidth - 1) else false.B + if (data.getWidth > width) { + data(width - 1, 0) + } else { + Fill(width - data.getWidth, sign) ## data + } + } + def calculateSegmentWriteMask( datapath: Int, laneNumber: Int, diff --git a/t1/src/sequencer/T1TokenManager.scala b/t1/src/sequencer/T1TokenManager.scala index b80ed2dcd..de19e9be0 100644 --- a/t1/src/sequencer/T1TokenManager.scala +++ b/t1/src/sequencer/T1TokenManager.scala @@ -7,10 +7,20 @@ import chisel3._ import chisel3.experimental.hierarchy.{instantiable, public} import chisel3.util._ +class IssueToken(parameter: T1Parameter) extends Bundle { + val instructionIndex: UInt = UInt(parameter.instructionIndexBits.W) + val writeV0: Bool = Bool() + val 
useV0AsMask: Bool = Bool() + val isLoadStore: Bool = Bool() +} + @instantiable class T1TokenManager(parameter: T1Parameter) extends Module { @public - val writeV0 = IO(Vec(parameter.laneNumber, Flipped(Valid(UInt(parameter.instructionIndexBits.W))))) + val instructionIssue: ValidIO[IssueToken] = IO(Flipped(Valid(new IssueToken(parameter)))) + + @public + val issueAllow: Bool = IO(Output(Bool())) @public val instructionFinish: Vec[UInt] = IO(Vec(parameter.laneNumber, Input(UInt(parameter.chainingSize.W)))) @@ -18,19 +28,42 @@ class T1TokenManager(parameter: T1Parameter) extends Module { @public val v0WriteValid = IO(Output(UInt(parameter.chainingSize.W))) - // v0 write token - val v0WriteValidVec: Seq[UInt] = Seq.tabulate(parameter.laneNumber) { laneIndex => - val update: ValidIO[UInt] = writeV0(laneIndex) - val clear: UInt = instructionFinish(laneIndex) - val updateOH = maskAnd(update.valid, indexToOH(update.bits, parameter.chainingSize)).asUInt + val issueIndex1H: UInt = indexToOH(instructionIssue.bits.instructionIndex, parameter.chainingSize) + + // Boolean type token clear & set + def updateBooleanToken(set: UInt, clear: UInt): UInt = { VecInit(Seq.tabulate(parameter.chainingSize) { chainingIndex => val res = RegInit(false.B) - when(updateOH(chainingIndex) || clear(chainingIndex)) { - res := updateOH(chainingIndex) + when(set(chainingIndex) || clear(chainingIndex)) { + res := set(chainingIndex) } res }).asUInt } + // v0 write token + val v0WriteValidVec: Seq[UInt] = Seq.tabulate(parameter.laneNumber) { laneIndex => + val v0WriteIssue = instructionIssue.valid && instructionIssue.bits.writeV0 + val clear: UInt = instructionFinish(laneIndex) + val updateOH = maskAnd(v0WriteIssue, issueIndex1H).asUInt + updateBooleanToken(updateOH, clear) + } + + val useV0AsMaskToken: UInt = Seq + .tabulate(parameter.laneNumber) { laneIndex => + val useV0Issue = instructionIssue.valid && instructionIssue.bits.useV0AsMask + val clear: UInt = instructionFinish(laneIndex) + val updateOH = maskAnd(useV0Issue, issueIndex1H).asUInt + updateBooleanToken(updateOH, clear) + } + .reduce(_ | _) + v0WriteValid := v0WriteValidVec.reduce(_ | _) + + // v0 read-write conflict + val v0Conflict: Bool = + (instructionIssue.bits.writeV0 && useV0AsMaskToken.orR) || + (instructionIssue.bits.useV0AsMask && v0WriteValid.orR) + + issueAllow := !(v0Conflict) } diff --git a/t1/src/vrf/VRF.scala b/t1/src/vrf/VRF.scala index eaccfee70..fa676c6fc 100644 --- a/t1/src/vrf/VRF.scala +++ b/t1/src/vrf/VRF.scala @@ -529,7 +529,11 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar val dataInLsuQueue = ohCheck(loadDataInLSUWriteQueue, record.bits.instIndex, parameter.chainingSize) // elementMask update by write val writeUpdateValidVec: Seq[Bool] = - writePort.map(p => p.fire && p.bits.instructionIndex === record.bits.instIndex && p.bits.mask(3)) + writePort.map(p => + p.fire && p.bits.instructionIndex === record.bits.instIndex && + // Only index load will split the datapath into separate parts. 
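+        // i.e. only a load may write an element across several beats, so for loads wait for
+        // the beat that carries the top byte enable (mask(3)); any other write completes its
+        // element in a single beat, so every matching fire counts.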
+ (p.bits.mask(3) || !record.bits.ls) + ) val writeUpdate1HVec: Seq[UInt] = writeOH.zip(writeUpdateValidVec).map { case (oh, v) => Mux(v, oh, 0.U) } // elementMask update by read of store instruction val loadUpdateValidVec = @@ -545,7 +549,7 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar val waitLaneClear = record.bits.state.stFinish && record.bits.state.wWriteQueueClear && record.bits.state.wLaneLastReport && record.bits.state.wTopLastReport - val stateClear: Bool = waitLaneClear && record.bits.state.wLaneClear + val stateClear: Bool = waitLaneClear && record.bits.state.wLaneClear || record.bits.elementMask.andR when(topLastReport) { record.bits.state.stFinish := true.B @@ -607,7 +611,8 @@ class VRF(val parameter: VRFParam) extends Module with SerializableModule[VRFPar Mux(older, sourceVdEqSinkVs, sinkVdEqSourceVs) ) val rawForeStore = Mux(older, isStore.head && isSlow.last, isStore.last && isSlow.head) && samVd - (hazardForeLoad, rawForeStore) + // (hazardForeLoad, rawForeStore) todo: need check hazard? + (false.B, false.B) } } writeReadyForLsu := !hazardVec.map(_.map(_._1).reduce(_ || _)).reduce(_ || _)
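Editorial note on the new 5-bit topUop encoding (an illustration, not part of the patch): the comment block at the top of MaskUnit.scala reads the field as "uu ii x", where the upper two bits pick the executing sub-unit and the remaining bits carry the per-unit opcode. A minimal Chisel sketch of that split, with illustrative module and port names, assuming only what the patch itself shows:

  import chisel3._
  import chisel3.util._

  class TopUopView extends Module {
    val topUop   = IO(Input(UInt(5.W)))
    // one-hot over {slide/gather (needs VRF read), compress/viota/vmv, reduce, extend/mask-destination}
    val unitType = IO(Output(UInt(4.W)))
    val gather16 = IO(Output(Bool()))

    unitType := UIntToOH(topUop(4, 3))  // "uu": which sub-unit executes
    gather16 := topUop === "b00101".U   // gather with 16-bit indices
  }

This mirrors how MaskUnit derives unitType, readType and gather16 from Decoder.topUop; readType is simply unitType(0), marking the slide/gather group that must read the VRF through MaskUnitReadCrossBar.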