//------------------------------------------------------------------------------
// ldpcDecOp.v
// 
// Description:
//   Fetches data from the output buffer and delivers it to the external
//   downstream module. When clrToSend is high data will be presented along
//   with a strobe. Data is fetched from the memory once decodeComplete is
//   received.
//
//   OP_CONCAT mode causes the last output reg word not to be output when it is not
//   fully filled as a result of the amount of output data not being divisible
//   by `OP_WIDTH. Instead the part word is held in collection register to be
//   concatenated with the data from the next LDPC block. With the data held
//   in the collection register, the HD buffer can be freed for use by the
//   decoder.
//
// Inputs:
//   enable        : The enable register bit.
//   zEnum         : Indicates which Z size is being used (from `Z_SIZES2) list.
//   k2            : Actual number of output bits (shortening removed).
//   cycLast       : Cyclical shifts that apply to the data in the memory. These
//                   need removal when outputted.
//   clrToSend     : Downstream HW ready to accept output.
//   opBufferData  : Read data from HD mem.
//   decodeComplete: Pulse at the end of a decode. Note, this is delayed until
//                   opBufferLocked is low.
//   newBlk        : Generally this is the start of a new decode, but we use
//                   this signal specifically because it is synchronous to the
//                   last update of cycLast, which needs to be sampled.
// Outputs:
//   opStrobe      : When high a valid output sample is presented. This is
//                   deemed to have been accepted if clrToSend is high on
//                   the following clock edge at which point a new sample
//                   will be presented.
//   opDataWord    : 8 bits of the output data, presented in the same order
//                   that the data was received, lsb of first word being the
//                   first bit in time.
//   hdRamSel      : Select the HD RAM for read.
//   hdRamAddr     : The address for the HD RAM.
//   opBufferLocked: When high, this indicates that the output data for decode
//                   n has not been fully read. If the decoder completes decode
//                   n+1 it will wait for this to drop before proceeding with
//                   decode n+2.                
//
// 13 Jul 2010 M. Rumsey. Created.
// 27 Sep 2010 M. Rumsey. Added pOpBuffer register stage for timing improvement.
// 25 Apr 2011 M. Rumsey. Collection register mechanism replaced by that from
//                        ldpcDec, in order to get signal output widths.
//
// (c) Copyright 2010, Blue Rum Consulting Limited, All Rights Reserved.
//------------------------------------------------------------------------------

`include "ldpcDec.vh"

module ldpcDecOp
  (  
     input                                 nReset,
     input                                 opClk,
     input                                 enable,
     input                                 BlockEnable,
     input                                 disabling,
     // Fm Regs
     input [numBits(`LDEC_Z_ENUM_MAX)-1:0] zEnum,
     input [15:0]                          packetBytesLs,
     input [`LDEC_PB_MS_LEFT:0]            packetBytesMs, 
     input [numBits(`LDEC_NCOLS-1)-1:0]    parityStartCol,
     // Fm Top
     input [numBits(`LDEC_K_MAX)-1:0]      k2,
     input [numBits(`LDEC_Z_MAX)-1:0]      z,
     input `LDEC_cycShift_P                cycLastP, 
     input                                 clrToSend, 
     // Fm vmMem
     input `LDEC_sgnZType                  hdRamData,
     // Fm Ctrl
     input                                 decodeComplete,
     input                                 newBlk,
     input                                 packetComplete,
     // to Top
     output                                opStrobeOut,
     output [`LDEC_OP_WIDTH-1:0]           opDataWordOut,
     output reg [`LDEC_OP_WIDTH/8-1:0]     opByteEnOut, 
     output reg [numBits(`LDEC_OP_WIDTH)-1:0] opBitEnOut, 
     // To vmMem
     output                                hdRamSelOut,
     output [`LDEC_VM_RAM_A-1:0]           hdRamAddrOut, 
     // To Ctrl
     output                                opBufferLockedOut,
     output                                opClkEnOut,
     output                                opDoneOut,
     output reg                            blockOpCompleteOut, // 1 cycle after opDone
     output reg                            packetOpCompleteOut
     ); 

`include "ldpcDecFuncs.vh"

  // OP_RDZ is the number of rotate stages that are pushed into the
  // memory fetch cycle.  pOpBuffer calls ldpcDecRotateDownZ1 with the stage
  // arguments (OP_RDZ, 0) before registering the RAM data; the 'shifted'
  // assign calls it with (Z_HI, OP_RDZ+1) on the registered data.
  localparam OP_RDZ   = 2;
  // MFS: width in bits of one word (memDataWord) picked out of the shifted
  // RAM data and loaded into the collection register.
  localparam MFS      = `LDEC_MEM_FETCH_SIZE;
  // CR_LEN: length in bits of the collection register (collectionReg).
  localparam CR_LEN   = 2*MFS+`LDEC_OP_WIDTH;
  // CRB_MAX/CRB_HI: size the colRegBase pointer (msb index is CRB_HI).
  localparam CRB_MAX  = CR_LEN + `LDEC_OP_WIDTH;
  localparam CRB_HI   = numBits(CRB_MAX)-1;
  // Msb indices of the various counters and buses declared below.
  localparam PKB_HI   = numBits(`LDEC_MAX_PACKET_BITS-1)-1;  // packetBits
  localparam Z_HI     = numBits(`LDEC_Z_MAX)-1;              // z, numSamplesFromRow
  localparam Z1_HI    = numBits(`LDEC_Z_MAX-1)-1;            // leftShift
  localparam K_HI     = numBits(`LDEC_K_MAX)-1;              // k2, k2Reg
  localparam N_HI     = numBits(`LDEC_N_MAX)-1;              // numSamplesToCollect
  localparam NOB_HI   = numBits(`LDEC_K_MAX + 2*`LDEC_OP_WIDTH)-1; // numberOfBits
  localparam OB_HI    = `LDEC_OP_WIDTH/8-1;                  // opByteEnOut
  localparam RAM_HI   = `LDEC_VM_RAM_A-1;                    // hdRamAddr
  localparam NSF_HI   = numBits(MFS)-1;                      // numSamplesToFetch
  // CYC_BITS: element width handed to LDEC_UNPACK for the cycLast array.
  localparam CYC_BITS = numBits(`LDEC_Z_MAX-1);

  // Width-matched copies of constants, so that arithmetic and comparisons
  // in the processes below are done on operands of equal width.
  localparam [Z_HI:0] MFSZ = MFS;
  localparam [N_HI:0] MFSN = MFS;
  localparam [NSF_HI:0] MFSNSF = MFS;
  // CR_LIM: pCtrl allows a further collect only while the test-incremented
  // colRegBase is <= CR_LIM (evaluates to CR_LEN - MFS).
  localparam [CRB_HI:0] CR_LIM = CR_LEN-1 - MFS + 1;
  localparam [CRB_HI:0] OP_WIDTH_CRB = `LDEC_OP_WIDTH;
  localparam [Z_HI:0] OP_WIDTH_Z = `LDEC_OP_WIDTH;
  localparam [NOB_HI:0] OP_WIDTH_NOB = `LDEC_OP_WIDTH;
  // CR_LEN_MFS: pCollect loads memDataWord only while colRegBase is
  // <= CR_LEN_MFS, i.e. while an MFS-wide write fits inside collectionReg.
  localparam [CRB_HI:0] CR_LEN_MFS = CR_LEN - MFS;
    
  // Signals for correcting cyclical shift in output
  // opClkEn     : clock enable for this block (also driven out on opClkEnOut).
  // cycLast     : per-column shifts unpacked from the cycLastP port.
  // cycLastReg1 : first capture stage, loaded from cycLast on newBlk.
  // cycLastReg  : second stage, loaded from cycLastReg1 once opBufferLocked
  //               is low; indexed by hdRamAddr when forming leftShift_1.
  // shifted     : hdBufferData after the second part of the rotate.
  // hdBufferData: registered RAM data after the first part of the rotate.
  wire                                   opClkEn;
  wire `LDEC_cycShift_L                  cycLast       `LDEC_cycShift_R;  
  reg  `LDEC_cycShift_L                  cycLastReg    `LDEC_cycShift_R;  
  reg  `LDEC_cycShift_L                  cycLastReg1   `LDEC_cycShift_R;  
  wire `LDEC_sgnZType                    shifted;
  reg                                    waitForDecode;
  reg                                    opStrobe;
  reg `LDEC_sgnZType                     hdBufferData;
  reg                                    updateHdBuffer;
  reg                                    opBufferLocked;
  
  // leftShift_1 is computed when the RAM address advances; leftShift is a
  // copy taken when hdBufferData is registered.  k2Reg1/k2Reg double-buffer
  // the k2 input in the same way as cycLastReg1/cycLastReg.
  // decodeCompleteD1/D2 are one- and two-cycle delays of decodeComplete.
  reg [Z1_HI:0]                          leftShift;  
  reg [Z1_HI:0]                          leftShift_1;  
  reg [K_HI:0]                           k2Reg;
  reg [K_HI:0]                           k2Reg1;
  reg                                    decodeCompleteD1;
  reg                                    decodeCompleteD2;
  
  // HD RAM read interface (select is combinatorial, see hdRamSel assign).
  wire                                   hdRamSel;
  reg                                    hdRamSelForce;
  reg [RAM_HI:0]                         hdRamAddr;

  // Signals for Collection register management.


  reg [CR_LEN-1:0]                       collectionReg;
  reg                                    collectEnable;
  reg                                    outputEnable;

  
  // NOTE(review): MUX_HI (and MUX_MAX / MEM_MUX_MAX used further down) are
  // not declared in this file; presumably they come from ldpcDecFuncs.vh -
  // confirm.
  reg [CRB_HI:0]                         colRegBase;
  reg [CRB_HI:0]                         colRegBaseSnapshot;
  reg [MUX_HI:0]                         memMux;
  reg [MFS-1:0]                          memDataWord;
  reg [N_HI:0]                           numSamplesToCollect;
  reg [NSF_HI:0]                         numSamplesToFetch;
  reg [Z_HI:0]                           numSamplesFromRow;
  reg [NOB_HI:0]                         numberOfBits;
  reg                                    colRegPrimed;
  reg                                    lastCollectDone;
  reg                                    firstCollect;
  reg                                    opDone;
  reg                                    firstOutput;
  reg                                    lastBlock;
  reg                                    opBufferLockedD1;
  wire [PKB_HI:0]                        packetBits;
  reg  [PKB_HI:0]                        packetDataBitCount;
  
  // NOTE(review): idx1 is not referenced in the visible code; it may be
  // used inside the LDEC_UNPACK macro expansion - confirm before removing.
  genvar                                idx1;

//  // Update a section of a word. 
//  function [CR_LEN-1:0] updatePart(input [CR_LEN-1:0]           word,
//                                   input [CRB_HI:0] base,
//                                   input [MFS-1:0]              update);
//    localparam highestBase = $size(word) - $size(update);
//    reg [CR_LEN-1:0] result;  
//  begin
//    // Pass the input through to the output to set default values
//    result = word;
//    // Need to loop through all possibilities to make the required muxing exist
//    // and enable just the case that is currently enabled.
//    for (int b=0; b<=highestBase; b=b+1) begin
//      if (b[CRB_HI:0] == base) begin
//        result[b+:MFS] = update;
//      end
//    end
//    return result;
//  end
//  endfunction // updatePart
  // Populate the cycLast array from the packed cycLastP port (presumably one
  // CYC_BITS-wide element per column; the macro is defined elsewhere -
  // TODO confirm).
  `LDEC_UNPACK(gCycLast, cycLastP, cycLast, CYC_BITS, `LDEC_NCOLS)

  // Packet length in bits: byte count {packetBytesMs, packetBytesLs} with
  // three zero lsbs appended (i.e. multiplied by 8).
  assign packetBits  = {packetBytesMs, packetBytesLs, 3'b000};

  // Double-buffered capture of the per-block parameters (residual cyclical
  // shifts and k2) that apply to the output data memory.  The same process
  // also produces the delayed copies of opBufferLocked / decodeComplete and
  // maintains the count of packet data bits still to be output.
  always @(posedge(opClk) `LDEC_RESET_STR)
  begin : pCapture

    // Flags kept in blocking variables; they hold their value between
    // clock edges and are updated in statement order within one edge.
    reg     firstNewBlkV;    // set until the first newBlk has been caught
    reg     caughtNewBlkV;   // stage-1 data is waiting to move to stage 2
    reg     caughtNewBlk2V;  // lastBlock / k2 override not yet sampled
    integer col;

    if (nReset == 1'b0) begin
      // Flags and delay registers.
      firstNewBlkV     = 1'b1;
      caughtNewBlkV    = 1'b0;
      caughtNewBlk2V   = 1'b0;
      opBufferLockedD1 <= 1'b1;
      decodeCompleteD1 <= 1'b0;
      decodeCompleteD2 <= 1'b0;
      lastBlock        <= 1'b0;
      // Data registers are reset as well (different from VHDL).
      packetDataBitCount <= `LDEC_PAD(1'b0, PKB_HI);
      k2Reg              <= `LDEC_PAD(1'b0, K_HI);
      k2Reg1             <= `LDEC_PAD(1'b0, K_HI);
      for (col = `LDEC_NCOLS-1; col >= 0; col = col-1) begin
        cycLastReg[col]  <= `LDEC_PAD(1'b0, Z_HI);
        cycLastReg1[col] <= `LDEC_PAD(1'b0, Z_HI);
      end
    end
    else begin
      if (opClkEn == 1'b1 || `LDEC_CLK_GATING == 1) begin

        // Delay lines.
        opBufferLockedD1 <= opBufferLocked;
        decodeCompleteD2 <= decodeCompleteD1;
        if (decodeComplete) begin
          decodeCompleteD1 <= 1'b1;
        end
        else begin
          decodeCompleteD1 <= 1'b0;
        end

        // Stage 1 -> stage 2.  Once the previous block has released the
        // output buffer the captured shifts and k2 become the live values.
        // Two stages are needed because a decode may complete while the
        // previous block is still being output.
        if (caughtNewBlkV && !opBufferLocked) begin
          for (col = `LDEC_NCOLS-1; col >= 0; col = col-1) begin
            cycLastReg[col] <= cycLastReg1[col];
          end
          k2Reg <= k2Reg1;
          caughtNewBlkV = 1'b0;
        end

        // One cycle after the buffer is released, sample packetComplete.
        // On the last block of a packet with a non-zero length, k2 is
        // overridden by the number of packet bits still outstanding (this
        // assignment takes precedence over the k2Reg1 copy above).
        if (caughtNewBlk2V && !opBufferLockedD1) begin
          lastBlock <= packetComplete;
          caughtNewBlk2V = 1'b0;
          if (packetComplete && (packetBits > `LDEC_PAD(1'b0, PKB_HI))) begin
            k2Reg <= packetDataBitCount[K_HI:0];
          end
        end

        // Input -> stage 1.  newBlk is synchronous to the last update of
        // cycLast, so it is the sampling point.  Evaluated after the
        // stage 1 -> 2 move so that a slot freed this cycle can be reused.
        if (newBlk && !caughtNewBlkV) begin
          for (col = `LDEC_NCOLS-1; col >= 0; col = col-1) begin
            cycLastReg1[col] <= cycLast[col];
          end
          k2Reg1 <= k2;
          caughtNewBlkV  = 1'b1;
          caughtNewBlk2V = 1'b1;
          // The first block of a packet loads the packet bit counter.
          if (firstNewBlkV) begin
            packetDataBitCount <= packetBits;
            firstNewBlkV = 1'b0;
          end
        end

        // Padding removal: subtract the bits just output from the packet
        // bit counter.  opDone stretches while clrToSend is low, so it is
        // qualified with clrToSend to avoid decrementing more than once.
        if (opDone && clrToSend && (packetBits > `LDEC_PAD(1'b0, PKB_HI)) && !lastBlock) begin
          packetDataBitCount <= packetDataBitCount - `LDEC_PAD(k2Reg, PKB_HI-K_HI);
        end
      end

      // Synchronous flush while the block is disabled; this is outside the
      // clock-enable qualification and overrides the assignments above.
      if (!BlockEnable) begin
        firstNewBlkV     = 1'b1;
        caughtNewBlkV    = 1'b0;
        caughtNewBlk2V   = 1'b0;
        opBufferLockedD1 <= 1'b1;
        decodeCompleteD1 <= 1'b0;
        decodeCompleteD2 <= 1'b0;
        lastBlock        <= 1'b0;
      end
    end
  end //pCapture

  //---------------------------------------------------------------------------
  // HD RAM fetch and first part of the cyclical shift
  //---------------------------------------------------------------------------

  // pCtrl raises updateHdBuffer when it expects 'shifted' to change on the
  // next clock.  That update uses the word currently sitting on the RAM
  // output; the same cycle's select fetches the following word.  A forced
  // pre-fetch (hdRamSelForce) primes the RAM output at the start of a block.

  always @(posedge(opClk) `LDEC_RESET_STR)
  begin : pOpBuffer

    reg  [RAM_HI:0] nextAddrV;   // next value of hdRamAddr (temporary)

    if (nReset == 1'b0) begin
      hdRamSelForce <= 1'b0;
      hdRamAddr     <= `LDEC_PAD(1'b0, RAM_HI);
      // Data registers are reset as well (different from VHDL).
      hdBufferData  <= `LDEC_PAD(1'b0, `LDEC_Z_MAX-1);
      leftShift     <= `LDEC_PAD(1'b0, Z1_HI);
      leftShift_1   <= `LDEC_PAD(1'b0, Z1_HI);
    end
    else begin
      if (opClkEn == 1'b1 || `LDEC_CLK_GATING == 1) begin

        // Register the RAM output, applying the first rotate stages here
        // and leaving the rest for after the register (timing).  The shift
        // amount is carried along for the second part of the rotate.
        if (updateHdBuffer == 1'b1 && clrToSend == 1'b1) begin
          hdBufferData <= ldpcDecRotateDownZ1(hdRamData, zEnum, OP_RDZ, 0, leftShift_1);
          leftShift    <= leftShift_1;
        end

        // One-cycle forced select for the pre-fetch after decodeComplete.
        if (decodeCompleteD1 == 1'b1) begin
          hdRamSelForce <= 1'b1;
        end
        else begin
          hdRamSelForce <= 1'b0;
        end

        // Address update.  A read that is still below parityStartCol
        // advances the address (and takes priority); otherwise the
        // pre-fetch rewinds it to zero; otherwise it holds.
        if (hdRamSel == 1'b1 && hdRamAddr < parityStartCol) begin
          nextAddrV = hdRamAddr + `LDEC_PAD(1'b1, RAM_HI);
          // Shift amount for the word being fetched, kept in step with it.
          leftShift_1 <= modZ(`LDEC_PADS(1'b0, Z1_HI+1) -
                              $signed({1'b0, cycLastReg[hdRamAddr]}), Z_SIZES2(zEnum));
        end
        else if (decodeCompleteD1 == 1'b1) begin
          nextAddrV = `LDEC_PAD(1'b0, RAM_HI);
        end
        else begin
          nextAddrV = hdRamAddr;
        end
        hdRamAddr <= nextAddrV;
      end

      // Disabling the block cancels any pending forced select.
      if (BlockEnable == 1'b0) begin
        hdRamSelForce <= 1'b0;
      end
    end
  end //pOpBuffer

  // Second part of the cyclical shift (upper bits of the shift amount),
  // applied to the registered RAM data.
  assign shifted  = ldpcDecRotateDownZ1(hdBufferData, zEnum, Z_HI, OP_RDZ+1, leftShift);

  // Read the HD RAM on the forced pre-fetch, and whenever the current RAM
  // output is being registered into hdBufferData.
  assign hdRamSel = (clrToSend & updateHdBuffer) | hdRamSelForce;

  //---------------------------------------------------------------------------
  // Manage Collection Register
  //---------------------------------------------------------------------------
  
  // pCtrl sequences the output of one decoded block:
  //  - idles in waitForDecode until decodeCompleteD2, then locks the output
  //    buffer and requests the first hdBuffer update;
  //  - while clrToSend is high, decides each cycle whether to output
  //    (outputEnable) and whether to collect another word (collectEnable),
  //    and maintains the sample counters, colRegBase, memMux and
  //    updateHdBuffer;
  //  - returns to the idle state when BlockEnable is low or opDone is set.
  // Identifiers ending in V are blocking temporaries.
  always @(posedge(opClk) `LDEC_RESET_STR)
  begin : pCtrl

    reg [CRB_HI:0]                   colRegBaseV;
    reg [N_HI:0]                     numSamplesToCollectV;
    reg [NSF_HI:0]                   numSamplesToFetchV;
    reg [Z_HI:0]                     numSamplesFromRowV;
    reg                              lastCollectDoneV;
    reg                              collectConfirmedV; 
    reg                              incrAddrV;
    reg [Z_HI:0]                     zV;
    
    if (nReset == 1'b0) begin
      updateHdBuffer <= 1'b0;
      outputEnable <= 1'b0;
      opBufferLocked <= 1'b0;
      waitForDecode <= 1'b1;
      colRegBase <= `LDEC_PAD(1'b0, CRB_HI);
      colRegPrimed <=1'b0;
      collectEnable <= 1'b0;
      firstCollect <= 1'b1;
      // RAM addressing
      memMux <= `LDEC_PAD(1'b0, MUX_HI);
      lastCollectDone <= 1'b0;
      //if (`LDEC_RESET_ALL) begin (different from VHDL)
      numSamplesToCollect <= `LDEC_PAD(1'b0, N_HI);
      numSamplesToFetch <= `LDEC_PAD(1'b0, NSF_HI);
      numSamplesFromRow <= `LDEC_PAD(1'b0, Z_HI);
      //end
    end else begin
      if (opClkEn == 1'b1) begin
        if (enable == 1'b0 || (opDone == 1'b1 && ! `LDEC_CONCAT_OP)) begin
          // We only reset the pointer in the collection register at the start
          // of packet. This allows a partial output word from the previous block
          // to be concatenated with data from the following decode block.
          colRegBase <= `LDEC_PAD(1'b0, CRB_HI);
        end
        if (BlockEnable == 1'b0 || opDone == 1'b1) begin
          // Return to idle. Note colRegBase is deliberately not cleared here
          // (see above).
          updateHdBuffer <= 1'b0;
          outputEnable <= 1'b0;
          opBufferLocked <= 1'b0;
          waitForDecode <= 1'b1;
          colRegPrimed <=1'b0;
          collectEnable <= 1'b0;       
          firstCollect <= 1'b1;
          numSamplesToCollect <= `LDEC_PAD(1'b0, N_HI);
          numSamplesToFetch <= `LDEC_PAD(1'b0, NSF_HI);
          numSamplesFromRow <= `LDEC_PAD(1'b0, Z_HI);
          // RAM addressing
          memMux <=  `LDEC_PAD(1'b0, MUX_HI);
          lastCollectDone <= 1'b0;
  
        end
        else begin
  
          lastCollectDoneV = 1'b0;
          
          if (waitForDecode == 1'b1) begin
            if (decodeCompleteD2 == 1'b1) begin
              // Move into output mode
              opBufferLocked <= 1'b1;
              waitForDecode <= 1'b0; 
              updateHdBuffer <= 1'b1; 
            end
          end
          else if (clrToSend == 1'b1) begin
  
            //-------------------------------------------------------------------
            // Whether to Output
            //-------------------------------------------------------------------
  
            // If the Op is wider than the memory data width then the output is
            // not going to be continuous. Arrange for a 2x slow down of the Op.
            if (z < OP_WIDTH_Z) begin
              outputEnable <= !outputEnable & colRegPrimed;
            end
            else begin
              outputEnable <= colRegPrimed;
            end
            
            //-------------------------------------------------------------------
            // Whether to enable the collection register.
            //-------------------------------------------------------------------
            // Work out if there is enough space in the collection register if
            // we enable a collect.
            // Note: N_HI >  NSF_HI > Z_HI 
            // NOTE(review): the slice zV[NSF_HI:0] below (zV is [Z_HI:0]) and
            // the [Z_HI:0] localparam MFSZ only make sense if NSF_HI <= Z_HI,
            // which contradicts the ordering stated above - confirm.
            numSamplesToCollectV = `LDEC_PAD(1'b0, N_HI);
            numSamplesToFetchV = `LDEC_PAD(1'b0, NSF_HI);
            numSamplesFromRowV = `LDEC_PAD(1'b0, Z_HI);
            collectConfirmedV = 1'b0;
            if (lastCollectDone == 1'b0) begin
              // Limit num fetched samples at end of a Z row.
              // NOTE(review): z - MFSZ is unsigned; this assumes z >= MFS -
              // confirm.
              if (numSamplesFromRow >= z - MFSZ) begin
                zV = z - numSamplesFromRow;
                numSamplesToFetchV = zV[NSF_HI:0];                
                numSamplesToCollectV = numSamplesToCollect +
                                       `LDEC_PAD(numSamplesToFetchV, N_HI-NSF_HI);
                numSamplesFromRowV = `LDEC_PAD(1'b0, Z_HI);
                                                 
              end
              else begin
                numSamplesToFetchV = MFSNSF;
                numSamplesToCollectV = numSamplesToCollect + MFSN;
                numSamplesFromRowV = numSamplesFromRow + MFSZ;
              end
              // But if we have read into the shortened region skip shortening.
              if (numSamplesToCollectV >= k2Reg) begin
                lastCollectDoneV = 1'b1;   // register later
                numSamplesToCollectV = k2Reg;
                numSamplesToFetchV = k2Reg - numSamplesToCollect;
                numSamplesFromRowV = `LDEC_PAD(1'b0, Z_HI);  // parity is on new row.
              end
              // colRegBase is where the data will be put. Do a test increment
              // of this to see if there is space.
              if (firstCollect == 1'b1) begin
                collectConfirmedV = 1'b1;
                // leave colRegBase at 0.
              end
              else begin
                // Here we assume MFS samples (not numSamplesToCollectV) will
                // be loaded. This simplification reduces the complexity of
                // the collection process.
                colRegBaseV = colRegBase;
                //report "col reg base A" & integer'image(colRegBaseV);
                if (collectEnable == 1'b1) begin
                  colRegBaseV = colRegBaseV + `LDEC_PAD(numSamplesToFetch, CRB_HI-NSF_HI); // What's being loaded now.
                  //report "col reg base B" & integer'image(colRegBaseV);
                end
                if (outputEnable == 1'b1) begin  // delays op to 'stock' the buffer
                  colRegBaseV = colRegBaseV - OP_WIDTH_CRB;
                  //report "col reg base C" & integer'image(colRegBaseV);
                end
                // What if we load more?
                if (colRegBaseV <= CR_LIM) begin
                  // Allow collection
                  collectConfirmedV = !lastCollectDone;
                end
              end             
            end
            
            // Commit the counters only when the collect goes ahead.
            if (collectConfirmedV == 1'b1) begin              
              collectEnable <= 1'b1;
              numSamplesToCollect <= numSamplesToCollectV;
              numSamplesToFetch <= numSamplesToFetchV;
              numSamplesFromRow <= numSamplesFromRowV;
            end
            else begin
              collectEnable <= 1'b0;
              numSamplesToFetch <= `LDEC_PAD(1'b0, NSF_HI);
            end
            if (updateHdBuffer == 1'b1) begin            
              firstCollect <= 1'b0;
            end
            //---------------------------------------------------------------
            // Counters synchronous to the fetch
            //---------------------------------------------------------------
            //if collectEnable == 1'b1 begin
            if (collectConfirmedV == 1'b1) begin
              // Allow the collection reg to get more than one sample so it can
              // handle a small amount of data being fetched at the transition
              // past the shortened region.
              colRegPrimed <= 1'b1;
            end
            //-------------------------------------------------------------------
            // Collection register base.
            //-------------------------------------------------------------------
            // Where collected data goes on the next cycle (colRegBase) depends
            // on the previous base, how much data was previously collected and
            // how much was output.
            colRegBaseV = colRegBase;
            if (collectEnable == 1'b1) begin
              colRegBaseV = colRegBaseV + `LDEC_PAD(numSamplesToFetch, CRB_HI-NSF_HI);
            end
            // Decrement colRegBase if there is to be an output sample. Generally
            // this is signified by outputEnable, but if the LDPC block does not
            // fill a whole number of output reg words then the output will be
            // held over till the next block.
            if (outputEnable == 1'b1 && colRegBaseV >= OP_WIDTH_CRB) begin 
              colRegBaseV = colRegBaseV - OP_WIDTH_CRB;
            end
            colRegBase <= colRegBaseV;
  
            //--------------------------------------------------------------------
            // Manage RAM de-muxing
            //--------------------------------------------------------------------
  
            // memMux is used combinatorially to select sections of the
            // RAM data captured on the shifted register.
            
            if (collectEnable == 1'b1) begin                                          
              if ((memMux == MEM_MUX_MAX(zEnum)) || (lastCollectDone == 1'b1)) begin
                memMux <= `LDEC_PAD(1'b0, MUX_HI);
              end
              else begin
                memMux <= memMux + `LDEC_PAD(1'b1, MUX_HI);
              end                                 
            end                     // collect enable
            // Default: no hdBuffer update; overridden below when the next
            // collect needs a new RAM word.
            updateHdBuffer <= 1'b0;     
            if (collectConfirmedV == 1'b1) begin
              
              incrAddrV = 1'b0;
              if (MEM_MUX_MAX(zEnum) == `LDEC_PAD(1'b0, MUX_HI)) begin
                // Only one mux position: every collect needs a new RAM word.
                incrAddrV = 1'b1;
              end
              else begin
                if  // Normal case, detect cycle before last mem mux value
                  ((memMux == (MEM_MUX_MAX(zEnum)-`LDEC_PAD(1'b1, MUX_HI)) && collectEnable == 1'b1) ||
                            // Case where previous cycle was a no collect
                            (memMux == MEM_MUX_MAX(zEnum) && collectEnable == 1'b0))
                begin
                  incrAddrV = 1'b1;
                end
              end      
              if (incrAddrV) begin
                updateHdBuffer <= 1'b1;              
              end
              lastCollectDone <= lastCollectDoneV;
                
            end  // collect confirmed
          end  // else waitForDecode
        end  // else: not (BlockEnable low or opDone)
      end  // if (opClkEn == 1'b1)
    end    

  end //pCtrl
    
  //-------------------------------------------------------------------
  // End-of-block indication
  //-------------------------------------------------------------------
  // blockOpCompleteOut is opDone registered once more (so it follows opDone
  // by one enabled clock), held low while the block is disabled.
  always @(posedge(opClk) `LDEC_RESET_STR)
  begin : pDone
    if (nReset == 1'b0) begin
      blockOpCompleteOut <= 1'b0;
    end
    else if (opClkEn == 1'b1) begin
      if (BlockEnable == 1'b0) begin
        blockOpCompleteOut <= 1'b0;
      end
      else begin
        blockOpCompleteOut <= opDone;
      end
    end
  end //pDone

  //-------------------------------------------------------------------
  // Load collection register
  //-------------------------------------------------------------------
  // pCollect owns the collection register and the output handshake:
  //  - shifts collectionReg down by one output word after each opStrobe;
  //  - writes memDataWord into collectionReg at colRegBase when
  //    collectEnable is set;
  //  - counts the bits output (numberOfBits) against k2Reg to raise opDone;
  //  - drives opStrobe, opByteEnOut, opBitEnOut and packetOpCompleteOut.
  // Everything except reset and the BlockEnable flush is gated by clrToSend.
  // Identifiers ending in V are blocking temporaries.
  always @(posedge(opClk) `LDEC_RESET_STR)
  begin : pCollect
    
    reg [CR_LEN-1:0]  collectionRegV;
    reg [NOB_HI:0]    numberOfBitsV;
    reg               lastOutputV;
    reg               opStrobeV;
    integer           bitIdx;
    
    if (nReset == 1'b0) begin
      packetOpCompleteOut <= 1'b0;
      opDone <= 1'b0;
      opStrobe <= 1'b0;
      opByteEnOut <= {OB_HI+1 {1'b0}};
      numberOfBits <= `LDEC_PAD(1'b0, NOB_HI);
      firstOutput <= 1'b0;
      //if (`LDEC_RESET_ALL) begin (different from VHDL)
      collectionReg <= `LDEC_PAD(1'b0, CR_LEN-1);
      colRegBaseSnapshot <=  `LDEC_PAD(1'b0, CRB_HI);
      //end
      opBitEnOut <= 'd0;
    end else begin
      if (opClkEn == 1'b1) begin
        if (BlockEnable == 1'b0) begin
          opDone <= 1'b0;
          packetOpCompleteOut <= 1'b0;
          opStrobe <= 1'b0;
          opByteEnOut <= {OB_HI+1 {1'b0}};
          numberOfBits <=`LDEC_PAD(1'b0, NOB_HI); 
          firstOutput <= 1'b0;         
          opBitEnOut <= 'd0;
        end
        else if (clrToSend == 1'b1) begin
          opBitEnOut <= `LDEC_OP_WIDTH; // default
          
          // Defaults for this cycle; overridden further down as required.
          opStrobeV = 1'b0;
          opDone <= 1'b0;
          opByteEnOut <= {OB_HI+1 {1'b0}};
          collectionRegV = collectionReg;
          if (firstCollect == 1'b1) begin
            // Remember how much held-over data was in the collection
            // register when this block started.
            firstOutput <= 1'b1;
            colRegBaseSnapshot <= colRegBase;
          end
          // After an 'output' any remaining data in the collection register must
          // be shifted down.
          if (opStrobe == 1'b1) begin
            collectionRegV = collectionRegV >> `LDEC_OP_WIDTH;
          end
          // New data is being loaded at colRegBase.
          if (collectEnable == 1'b1) begin
            // With the 'if' this is the same as updatePart (so same as VHDL)
            // but this increases gate-count (though gate-count is still
            // better than with update part).
            if (colRegBase <= CR_LEN_MFS) begin
              collectionRegV[colRegBase +: MFS] = memDataWord;
            end else begin
              // NOTE(review): the pad argument here is MFS, whereas the
              // MFS-wide zero assigned to memDataWord in pMemMux uses MFS-1 -
              // confirm the intended width.
              collectionRegV[colRegBase +: MFS] = `LDEC_PAD(1'b0, MFS);
            end
//            collectionRegV = updatePart(collectionRegV, colRegBase, memDataWord);
          end
          collectionReg <= collectionRegV;
          
          // Track the number of bits that are about to be output. 
          lastOutputV = 1'b0;
          if (outputEnable== 1'b1 && opDone == 1'b0) begin
            if (firstOutput == 1'b1) begin
              if (`LDEC_CONCAT_OP) begin
                // We may be outputting data from the previous block so there is
                // less than `LDEC_OP_WIDTH data.
                numberOfBitsV = OP_WIDTH_NOB -
                  `LDEC_PAD(colRegBaseSnapshot, NOB_HI-CRB_HI);
              end
              else begin
                numberOfBitsV = OP_WIDTH_NOB;
              end
            end
            else begin
              numberOfBitsV = numberOfBits + OP_WIDTH_NOB;
            end
            if (numberOfBitsV >= k2Reg) begin
              // This output word reaches the end of the block.
              opDone <= 1'b1;
              packetOpCompleteOut <= lastBlock;
              lastOutputV = 1'b1;
              // Bits of this block carried by the final word.
              // NOTE(review): numberOfBits is the registered count, which on
              // a first output is whatever the previous block left - confirm
              // this is intended when the first word is also the last.
              opBitEnOut <= k2Reg-numberOfBits;
            end
            numberOfBits <= numberOfBitsV;
          end
          // Generally we create an opStrobe after the outputEnable signal,
          // however there is also special handling required when the last bits
          // of the LDPC block do not fill a whole output word.
          if (outputEnable == 1'b1 && opDone == 1'b0) begin
            
            firstOutput <= 1'b0;
            // Generate op strobe except on last output in `LDEC_CONCAT_OP mode.
            // Unless it is the last packet or the last output fully fills the
            // output word.
            if (lastOutputV == 1'b0 || !`LDEC_CONCAT_OP ||
                (colRegBase + `LDEC_PAD(numSamplesToFetch, CRB_HI-NSF_HI) >= OP_WIDTH_CRB || packetComplete == 1'b1)) begin
              opStrobeV = 1'b1;
            end
            else begin
              // In this case the partial output word is retained in order to be
              // joined with the next block, except at the end of the packet.
              opStrobeV = 1'b0;
            end                       // colRegBase
            opByteEnOut <= opStrobeV ? {OB_HI+1 {1'b1}} : {OB_HI+1 {1'b0}};
            
            // If this is the last word and it is not `LDEC_CONCAT_OP mode, then
            // zero any unused bits. Do this also for last block in `LDEC_CONCAT_OP mode.
            if (lastOutputV == 1'b1 && (!`LDEC_CONCAT_OP || packetComplete == 1'b1)) begin
              // A partial word may be provided on the last block. Zero any
              // unused bits of the output word.
              // These non-blocking assignments come after the ones above, so
              // they take precedence for opByteEnOut and for the individual
              // collectionReg bits that are cleared.
              opByteEnOut <= {OB_HI+1 {1'b0}};
              for (bitIdx=0; bitIdx<`LDEC_OP_WIDTH; bitIdx=bitIdx+1) begin
                if (bitIdx[CRB_HI:0] >= colRegBase + `LDEC_PAD(numSamplesToFetch, CRB_HI-NSF_HI)) begin
                  collectionReg[bitIdx] <= 1'b0;
                end
                else begin
                  // Byte enable is set for every byte holding a valid bit.
                  opByteEnOut[bitIdx>>3] <= opStrobeV;
                end
              end
            end
            
          end                         // outputEnable
          opStrobe <= opStrobeV;
        end
      end
    end

  end //pCollect

  //---------------------------------------------------------------------------
  // Select one MFS-wide word of the shifted memory read data.
  //---------------------------------------------------------------------------

  // Combinatorial mux: memDataWord is the slice of 'shifted' that starts at
  // bit MFS*memMux, or all zeros when memMux matches no index in 0..MUX_MAX.
  always @(memMux or shifted)
  begin : pMemMux
    integer sel;
    memDataWord = `LDEC_PAD(1'b0, MFS-1);
    for (sel = 0; sel <= MUX_MAX; sel = sel + 1) begin
      if (sel[MUX_HI:0] == memMux) begin
        memDataWord = shifted[MFS*sel +: MFS];
      end
    end
  end //pMemMux

  // Clock enable.  The clock runs on newBlk and on decodeComplete (plus its
  // two delayed copies) so that those pulses are caught, while clrToSend is
  // high during a transfer, while disabling, and while opDone is set.  When
  // clrToSend drops mid-transfer the clock stops and opStrobe stays high
  // with valid data still presented.  The controller can always fetch
  // faster than it can deliver, so opStrobe stays high until the final
  // data is delivered.
  assign opClkEn  = clrToSend | opDone | disabling | newBlk |
                    decodeComplete | decodeCompleteD1 | decodeCompleteD2;

  // To Ctrl
  assign opClkEnOut        = opClkEn;
  assign opDoneOut         = opDone;
  assign opBufferLockedOut = opBufferLocked;

  // To vmMem (select is qualified by the clock enable)
  assign hdRamAddrOut = hdRamAddr;
  assign hdRamSelOut  = opClkEn & hdRamSel;

  // To Top: the output word is the bottom of the collection register.
  assign opDataWordOut = collectionReg[`LDEC_OP_WIDTH-1:0];
  assign opStrobeOut   = opStrobe;
  
endmodule

// pMemMux (shifting) and updatePart are best done using raw Verilog. Gate count is
// still higher, because logically the functionality is more complicated.

    
     
    
