//
// This is the phi block of the svd system, incorporating the phase/angle cordics and other logic.
//

`default_nettype none

module phi (
  ///////////////////////////////////////////////
  //$port_g Clock and reset
  ///////////////////////////////////////////////
  input  wire                 BFRModemClk,          // BFR Modem Clock
  input  wire                 nBFRModemRst,         // Active Low Reset(Modem Domain)
  ///////////////////////////////////////////////
  //$port_g configuration and control
  ///////////////////////////////////////////////
  input  wire       [1:0]     cfgNr,
  input  wire                 tctlAdvance,       
  
`ifdef RW_TXRX_2X2
  ///////////////////////////////////////////////
  //Singular vectors computation, only for NRx=2
  ///////////////////////////////////////////////
  //$port_g Hermitian inputs.
  input wire        [11:0]    hermM11,
  input wire signed [12:0]    hermMRe12,
  input wire signed [12:0]    hermMIm12,
  //$port_g lambda inputs.
  input wire        [12:0]    lambdaLambda1,
  input wire        [12:0]    lambdaLambda2,
`endif // RW_TXRX_2X2
  
  ///////////////////////////////////////////////
  //$port_g H vector inputs.
  ///////////////////////////////////////////////
  input wire signed [12:0]    HMemIFGroupedRe11,
  input wire signed [12:0]    HMemIFGroupedIm11,
  input wire signed [12:0]    HMemIFGroupedRe12,
  input wire signed [12:0]    HMemIFGroupedIm12,
//`ifdef RW_MUMIMO_RX_EN
  input wire signed [12:0]    HMemIFGroupedRe13,
  input wire signed [12:0]    HMemIFGroupedIm13,
  input wire signed [12:0]    HMemIFGroupedRe14,
  input wire signed [12:0]    HMemIFGroupedIm14,
//`endif // RW_MUMIMO_RX_EN
`ifdef RW_TXRX_2X2
  input wire signed [12:0]    HMemIFGroupedRe21,
  input wire signed [12:0]    HMemIFGroupedIm21,
  input wire signed [12:0]    HMemIFGroupedRe22,
  input wire signed [12:0]    HMemIFGroupedIm22,
//`ifdef RW_MUMIMO_RX_EN
  input wire signed [12:0]    HMemIFGroupedRe23,
  input wire signed [12:0]    HMemIFGroupedIm23,
  input wire signed [12:0]    HMemIFGroupedRe24,
  input wire signed [12:0]    HMemIFGroupedIm24,
//`endif // RW_MUMIMO_RX_EN
`endif // RW_TXRX_2X2

  ///////////////////////////////////////////////
  //$port_g outputs
  ///////////////////////////////////////////////
//`ifdef RW_MUMIMO_RX_EN
  output wire       [11:0]    phiPhi21,
  output wire       [11:0]    phiPhi31,
  output wire       [11:0]    phiPsi31,
  output wire       [11:0]    phiPsi41,
`ifdef RW_TXRX_2X2
  output wire       [11:0]    phiPsi32,
  output wire       [11:0]    phiPsi42,
  output wire       [11:0]    phiPhi22,
  output wire       [11:0]    phiPhi32,
`endif // RW_TXRX_2X2
//`endif // RW_MUMIMO_RX_EN
  output wire       [11:0]    phiPhi11,
  output wire       [11:0]    phiPsi21
  );
  

  // Encodings of cfgNr (Nr index = Nr-1).
  localparam NR_UNSUPPORTED = 2'd0;
  localparam NR2            = 2'd1;
  localparam NR3            = 2'd2;
  localparam NR4            = 2'd3;

  // Bit width of the Vf wires; also passed as WIDTH to maxabs, log2floor and roundscale.
`ifndef RW_TXRX_2X2
  localparam VFWIDTH = 13;   // Vf is assigned straight from the 13-bit Ht registers
`else
  localparam VFWIDTH = 28;   // Vf is driven by the cmult13 outputs
`endif

`ifdef RW_TXRX_2X2
  // pipeline HMem data to align with lambda
  reg  signed [12:0]     pHMemIFGroupedRe11;        // Pipeline delayed of HMemIFGrouped.
  reg  signed [12:0]     pHMemIFGroupedIm11;
  reg  signed [12:0]     pHMemIFGroupedRe12;
  reg  signed [12:0]     pHMemIFGroupedIm12;
  reg  signed [12:0]     pHMemIFGroupedRe21;
  reg  signed [12:0]     pHMemIFGroupedIm21;
  reg  signed [12:0]     pHMemIFGroupedRe22;
  reg  signed [12:0]     pHMemIFGroupedIm22;

  //Singular vectors computation, only for NRx=2
  reg         [11:0]     pHermM11;                // Pipeline delayed versions of hermM
  reg  signed [12:0]     pHermMRe12;
  reg  signed [12:0]     pHermMIm12;
  // eigen vectors
  reg  signed [13:0]     URe11;
  reg  signed [13:0]     UIm11;
  wire signed [13:0]     URe12;
  wire signed [13:0]     UIm12;
  reg  signed [13:0]     URe21;
  reg  signed [13:0]     URe22;
`endif // RW_TXRX_2X2

  reg  signed [12:0]     HtRe11;            // Transposed version of HMemIFGrouped.
  reg  signed [12:0]     HtIm11;            // 4x2
  reg  signed [12:0]     HtRe21;
  reg  signed [12:0]     HtIm21;
`ifdef RW_TXRX_2X2
  reg  signed [12:0]     HtRe12;
  reg  signed [12:0]     HtIm12;
  reg  signed [12:0]     HtRe22;
  reg  signed [12:0]     HtIm22;
`endif

  wire signed [VFWIDTH-1:0]     VfRe11;    // Vf 4x2
  wire signed [VFWIDTH-1:0]     VfIm11;
  wire signed [VFWIDTH-1:0]     VfRe21;
  wire signed [VFWIDTH-1:0]     VfIm21;
`ifdef RW_TXRX_2X2
  wire signed [VFWIDTH-1:0]     VfRe12;
  wire signed [VFWIDTH-1:0]     VfIm12;
  wire signed [VFWIDTH-1:0]     VfRe22;
  wire signed [VFWIDTH-1:0]     VfIm22;
`endif
  
  wire        [VFWIDTH-1:0]     Vv1;       // Vv
  wire        [ 4:0]     Vsc1;             // Vsc
  reg         [ 4:0]     dVsc1;            // Delayed version of Vsc

  wire signed [12:0]     VRe11;            // V
  wire signed [12:0]     VIm11;
  wire signed [12:0]     VRe21;
  wire signed [12:0]     VIm21;

  reg  signed [12:0]     pVRe11;           // Pipelined version of V
  reg  signed [12:0]     pVIm11;
  reg  signed [12:0]     pVRe21;
  reg  signed [12:0]     pVIm21;

`ifdef RW_TXRX_2X2
  wire        [VFWIDTH-1:0]     Vv2;
  wire        [ 4:0]     Vsc2;
  reg         [ 4:0]     dVsc2;

  wire signed [12:0]     VRe12;
  wire signed [12:0]     VIm12;
  wire signed [12:0]     VRe22;
  wire signed [12:0]     VIm22;

  reg  signed [12:0]     pVRe12;
  reg  signed [12:0]     pVIm12;
  reg  signed [12:0]     pVRe22;
  reg  signed [12:0]     pVIm22;

  reg  signed [12:0]     ppVRe12;          // 2 x Pipelined version of V
  reg  signed [12:0]     ppVIm12;
  reg  signed [12:0]     ppVRe22;
  reg  signed [12:0]     ppVIm22;
`endif
    
  wire                   CordicADone;
  wire                   CordicPsi1Done;

  wire signed [11:0]     phix11;
  wire signed [11:0]     phix21;
  reg  signed [11:0]     SubK;

  wire signed [12:0]     phiy11;
  wire signed [11:0]     phiz11;
  
  wire signed [11:0]     psi21;

  wire signed [12:0]     A1;       // A
  wire signed [12:0]     A2;
  
  
  //
  // Wires and registers that were originally guarded by RW_MUMIMO_RX_EN. The guard is
  // commented out, so they are now always declared.
  //
//`ifdef RW_MUMIMO_RX_EN

  wire signed [12:0]     A3;
  wire signed [12:0]     A4;
  reg  signed [12:0]     pA3;      // Version of A delayed by one pipeline tick.
  reg  signed [12:0]     pA4;
  
  reg  signed [12:0]     BRe3;     // B
  wire signed [12:0]     BIm3;
  reg  signed [12:0]     BRe4;   
  wire signed [12:0]     BIm4;
  wire signed [22:0]     BRe3tmp;
  wire signed [22:0]     BRe4tmp;
  
  reg  signed [12:0]     HtRe31;
  reg  signed [12:0]     HtIm31;
  reg  signed [12:0]     HtRe41;
  reg  signed [12:0]     HtIm41;

  wire signed [12:0]     VRe31; // V        
  wire signed [12:0]     VIm31;
  wire signed [12:0]     VRe41;
  wire signed [12:0]     VIm41;

  reg  signed [12:0]     pVRe31;        
  reg  signed [12:0]     pVIm31;
  reg  signed [12:0]     pVRe41;
  reg  signed [12:0]     pVIm41;

  wire signed [VFWIDTH-1:0]     VfRe31;  // Vf
  wire signed [VFWIDTH-1:0]     VfIm31;
  wire signed [VFWIDTH-1:0]     VfRe41;
  wire signed [VFWIDTH-1:0]     VfIm41;

  wire signed [11:0]     psi41;

  wire signed [11:0]     psi31;

  wire signed [11:0]     phix31;
  wire signed [12:0]     phiy31;
  wire signed [11:0]     phiz31;

  wire signed [11:0]     phix41;  

  wire signed [12:0]     phiy21;
  wire signed [11:0]     phiz21;

  wire                   CordicB3Done;
  wire                   CordicB4Done;
  wire                   CordicPsi31Done;

`ifdef RW_TXRX_2X2
  // pipeline HMem data to align with lambda
  reg  signed [12:0]     pHMemIFGroupedRe13;
  reg  signed [12:0]     pHMemIFGroupedIm13;
  reg  signed [12:0]     pHMemIFGroupedRe14;
  reg  signed [12:0]     pHMemIFGroupedIm14;
  reg  signed [12:0]     pHMemIFGroupedRe23;
  reg  signed [12:0]     pHMemIFGroupedIm23;
  reg  signed [12:0]     pHMemIFGroupedRe24;
  reg  signed [12:0]     pHMemIFGroupedIm24;

  wire signed [12:0]     CRe1;     // C
  wire signed [12:0]     CIm1;
  wire signed [12:0]     CRe2;
  wire signed [12:0]     CIm2;
  wire signed [12:0]     CRe3;
  wire signed [12:0]     CIm3;
  reg  signed [12:0]     pCRe3;   // Version of C delayed by 1 pipeline tick.
  reg  signed [12:0]     pCIm3;

  wire signed [12:0]     DRe;     // D
  wire signed [12:0]     DIm;
  wire signed [24:0]     DReA;    // Intermediate version of D.
  wire signed [24:0]     DImA;
  reg  signed [12:0]     dDRe;    // D delayed by one clock tick.
  reg  signed [12:0]     dDIm;

  wire signed [12:0]     ExRe2;   // Intermediate version of E, pre Overflow and shift.
  wire signed [12:0]     ExIm2;
  wire signed [12:0]     ExRe3;
  wire signed [12:0]     ExIm3;
  
  wire signed [12:0]     ERe2;     // E
  wire signed [12:0]     EIm2;
  wire signed [12:0]     ERe3;
  wire signed [12:0]     EIm3;

  wire signed [12:0]     FRe;    // F
  wire signed [12:0]     FIm;

  wire signed [12:0]     Icr1;
  wire signed [12:0]     Icr2;

  wire signed [12:0]     O2;     // O
  wire signed [12:0]     O3;
  reg  signed [12:0]     pO2;       // O2 delayed by one pipeline delay.  
  reg  signed [12:0]     ppO2;      // O2 delayed by two pipeline delay.
  wire signed [12:0]     ONr;
  reg  signed [12:0]     pONr;      // ONr delayed by one pipeline delay.
  reg  signed [12:0]     ppGxONr;   // ONr delayed by two pipeline delays + mult logic

  reg  signed [12:0]     HtRe32;
  reg  signed [12:0]     HtIm32;
  reg  signed [12:0]     HtRe42;
  reg  signed [12:0]     HtIm42;

  wire signed [12:0]     tmpR8;
  reg  signed [13:0]     R8;
  
  reg  signed [12:0]     VRe32; 
  reg  signed [12:0]     VIm32;
  wire signed [12:0]     VRe42;
  wire signed [12:0]     VIm42;

  reg  signed [12:0]     pVRe32;
  reg  signed [12:0]     pVIm32;
  reg  signed [12:0]     pVRe42;
  reg  signed [12:0]     pVIm42;

  reg  signed [12:0]     ppVRe32;
  reg  signed [12:0]     ppVIm32;
  reg  signed [12:0]     ppVRe42;
  reg  signed [12:0]     ppVIm42;

  wire signed [VFWIDTH-1:0]     VfRe32;
  wire signed [VFWIDTH-1:0]     VfIm32;
  wire signed [VFWIDTH-1:0]     VfRe42;
  wire signed [VFWIDTH-1:0]     VfIm42;
  
  wire signed [12:0]     iVRe32;
  wire signed [12:0]     iVIm32;
  
  reg  signed [12:0]     VtRe;
  reg  signed [12:0]     VtIm;
  reg  signed [12:0]     pVtRe;
  reg  signed [12:0]     pVtIm;

  reg  signed [24:0]     gx;
  wire signed [24:0]     tmpOs;

  wire signed [12:0]     phiNr2;
  wire signed [11:0]     phixNr2;
  reg  signed [12:0]     pPhiNr2;  // PhiNr2 delayed by one pipeline tick.

  reg  signed [11:0]     psiNr1;  

  wire signed [11:0]     psi32Nr4;
  wire        [11:0]     psi32Nr4Pos;

  wire signed [11:0]     psiNr2;
  wire        [11:0]     psiNr2Pos;

  wire signed [11:0]     phix22;
  wire signed [12:0]     phiy22;
  wire signed [11:0]     phiz22;

  wire signed [11:0]     phix32;
  wire signed [12:0]     phiy32;
  wire signed [11:0]     phiz32;

  reg  signed [11:0]     pPsi31;
  reg  signed [11:0]     ppPsi31;

  wire signed [11:0]     minusPhiz11;
  wire signed [11:0]     minusPhiz21;
  wire signed [11:0]     minusPhiz31;

  wire                   CordicCDone;
  wire                   CordicE2Done;   
  wire                   CordicE3Done;
  wire                   CordicCr10Done;
  wire                   CordicPhi7Done;
  wire                   CordicPhi8Done;
`endif  
//`endif  

  //
  // NRx=2 only: hold the HMemIFGrouped inputs for one pipeline period so that they line
  // up with the lambda data, which arrives one pipeline tick later.
  // The registers are cleared by the asynchronous active-low reset and loaded on clock
  // edges where tctlAdvance is asserted; otherwise they keep their value.
  //
`ifdef RW_TXRX_2X2
  always @(posedge BFRModemClk or negedge nBFRModemRst)
    begin
      if (!nBFRModemRst)
        begin
          // Row 1
          pHMemIFGroupedRe11 <= 13'd0;
          pHMemIFGroupedIm11 <= 13'd0;
          pHMemIFGroupedRe12 <= 13'd0;
          pHMemIFGroupedIm12 <= 13'd0;
          pHMemIFGroupedRe13 <= 13'd0;
          pHMemIFGroupedIm13 <= 13'd0;
          pHMemIFGroupedRe14 <= 13'd0;
          pHMemIFGroupedIm14 <= 13'd0;
          // Row 2
          pHMemIFGroupedRe21 <= 13'd0;
          pHMemIFGroupedIm21 <= 13'd0;
          pHMemIFGroupedRe22 <= 13'd0;
          pHMemIFGroupedIm22 <= 13'd0;
          pHMemIFGroupedRe23 <= 13'd0;
          pHMemIFGroupedIm23 <= 13'd0;
          pHMemIFGroupedRe24 <= 13'd0;
          pHMemIFGroupedIm24 <= 13'd0;
        end
      else if (tctlAdvance)
        begin
          // Row 1
          pHMemIFGroupedRe11 <= HMemIFGroupedRe11;
          pHMemIFGroupedIm11 <= HMemIFGroupedIm11;
          pHMemIFGroupedRe12 <= HMemIFGroupedRe12;
          pHMemIFGroupedIm12 <= HMemIFGroupedIm12;
          pHMemIFGroupedRe13 <= HMemIFGroupedRe13;
          pHMemIFGroupedIm13 <= HMemIFGroupedIm13;
          pHMemIFGroupedRe14 <= HMemIFGroupedRe14;
          pHMemIFGroupedIm14 <= HMemIFGroupedIm14;
          // Row 2
          pHMemIFGroupedRe21 <= HMemIFGroupedRe21;
          pHMemIFGroupedIm21 <= HMemIFGroupedIm21;
          pHMemIFGroupedRe22 <= HMemIFGroupedRe22;
          pHMemIFGroupedIm22 <= HMemIFGroupedIm22;
          pHMemIFGroupedRe23 <= HMemIFGroupedRe23;
          pHMemIFGroupedIm23 <= HMemIFGroupedIm23;
          pHMemIFGroupedRe24 <= HMemIFGroupedRe24;
          pHMemIFGroupedIm24 <= HMemIFGroupedIm24;
        end
    end
`endif
   
   
   //
   // Calculate H transposed (Ht). Row/column indices are swapped, real parts are copied
   // and imaginary parts are negated; no other arithmetic is done here.
   //
   always @(posedge BFRModemClk or negedge nBFRModemRst)
     begin
       if (!nBFRModemRst)
         begin
           // Column 1 of Ht
           HtRe11 <= 13'd0;
           HtIm11 <= 13'd0;
           HtRe21 <= 13'd0;
           HtIm21 <= 13'd0;
           HtRe31 <= 13'd0;
           HtIm31 <= 13'd0;
           HtRe41 <= 13'd0;
           HtIm41 <= 13'd0;
`ifdef RW_TXRX_2X2
           // Column 2 of Ht (only exists for two antennae)
           HtRe12 <= 13'd0;
           HtIm12 <= 13'd0;
           HtRe22 <= 13'd0;
           HtIm22 <= 13'd0;
           HtRe32 <= 13'd0;
           HtIm32 <= 13'd0;
           HtRe42 <= 13'd0;
           HtIm42 <= 13'd0;
`endif
         end
`ifdef RW_TXRX_2X2
       // Two antennae: the source is the pHMem copy (HMem delayed by one tctl tick to
       // align with lambda), and Ht is loaded only when tctlAdvance is asserted.
       else if (tctlAdvance)
         begin
           // Column 1 of Ht <- row 1 of H
           HtRe11 <=  pHMemIFGroupedRe11;
           HtIm11 <= -pHMemIFGroupedIm11;
           HtRe21 <=  pHMemIFGroupedRe12;
           HtIm21 <= -pHMemIFGroupedIm12;
           HtRe31 <=  pHMemIFGroupedRe13;
           HtIm31 <= -pHMemIFGroupedIm13;
           HtRe41 <=  pHMemIFGroupedRe14;
           HtIm41 <= -pHMemIFGroupedIm14;
           // Column 2 of Ht <- row 2 of H
           HtRe12 <=  pHMemIFGroupedRe21;
           HtIm12 <= -pHMemIFGroupedIm21;
           HtRe22 <=  pHMemIFGroupedRe22;
           HtIm22 <= -pHMemIFGroupedIm22;
           HtRe32 <=  pHMemIFGroupedRe23;
           HtIm32 <= -pHMemIFGroupedIm23;
           HtRe42 <=  pHMemIFGroupedRe24;
           HtIm42 <= -pHMemIFGroupedIm24;
         end
`else
       // One antenna: lambda is not used, so HMem is taken directly without the pHMem
       // delay. Note that in this configuration the load is not qualified by
       // tctlAdvance; Ht is updated on every clock edge.
       else
         begin
           HtRe11 <=  HMemIFGroupedRe11;
           HtIm11 <= -HMemIFGroupedIm11;
           HtRe21 <=  HMemIFGroupedRe12;
           HtIm21 <= -HMemIFGroupedIm12;
           HtRe31 <=  HMemIFGroupedRe13;
           HtIm31 <= -HMemIFGroupedIm13;
           HtRe41 <=  HMemIFGroupedRe14;
           HtIm41 <= -HMemIFGroupedIm14;
         end
`endif
     end
   
`ifdef RW_TXRX_2X2

   //
   // hermM pipeline and U calculation.
   // The hermM inputs are held for one pipeline period (pHermM*) so that they line up
   // with the lambda data, which arrives one pipeline tick later.
   // For timing reasons URe11, UIm11, URe21 and URe22 are generated a tick earlier and
   // registered. Column 2 of row 1 of U is identical to column 1.
   //
   always @(posedge BFRModemClk or negedge nBFRModemRst)
     begin
       if (!nBFRModemRst)
         begin
           pHermM11   <= 12'd0;
           pHermMRe12 <= 13'd0;
           pHermMIm12 <= 13'd0;
           URe11      <= 14'd0;
           UIm11      <= 14'd0;
           URe21      <= 14'd0;
           URe22      <= 14'd0;
         end
       else if (tctlAdvance)
         begin
           // Capture the hermM inputs.
           pHermM11   <= hermM11;
           pHermMRe12 <= hermMRe12;
           pHermMIm12 <= hermMIm12;
           // U(1,1) = -hermM12, sign-extended to 14 bits before the negation.
           URe11      <= -{pHermMRe12[12],pHermMRe12};
           UIm11      <= -{pHermMIm12[12],pHermMIm12};
           // U(2,x) = hermM11 - lambda(x), both zero-extended to 14 bits.
           URe21      <= {2'b00,pHermM11} - {1'b0,lambdaLambda1};
           URe22      <= {2'b00,pHermM11} - {1'b0,lambdaLambda2};
         end
     end

   assign UIm12 = UIm11;
   assign URe12 = URe11;

   //
   // Multiply Ht by U to get Vf.
   // The cmult13 module implements Z = A*B+C*D. Note that in this usage the imaginary
   // part of D is always zero and thus is not included in the port list.
   //

   // Vf(1,1) = Ht(1,1)*U(1,1)+Ht(1,2)*U(2,1)
   cmult13 m11 (
     .BFRModemClk(BFRModemClk), .nBFRModemRst(nBFRModemRst), .tctlAdvance(tctlAdvance),
     .ARe(HtRe11), .AIm(HtIm11), .BRe(URe11), .BIm(UIm11),
     .CRe(HtRe12), .CIm(HtIm12), .DRe(URe21),
     .ZRe(VfRe11), .ZIm(VfIm11)
   );

   // Vf(2,1) = Ht(2,1)*U(1,1)+Ht(2,2)*U(2,1)
   cmult13 m21 (
     .BFRModemClk(BFRModemClk), .nBFRModemRst(nBFRModemRst), .tctlAdvance(tctlAdvance),
     .ARe(HtRe21), .AIm(HtIm21), .BRe(URe11), .BIm(UIm11),
     .CRe(HtRe22), .CIm(HtIm22), .DRe(URe21),
     .ZRe(VfRe21), .ZIm(VfIm21)
   );

   // Vf(1,2) = Ht(1,1)*U(1,2)+Ht(1,2)*U(2,2)
   cmult13 m12 (
     .BFRModemClk(BFRModemClk), .nBFRModemRst(nBFRModemRst), .tctlAdvance(tctlAdvance),
     .ARe(HtRe11), .AIm(HtIm11), .BRe(URe12), .BIm(UIm12),
     .CRe(HtRe12), .CIm(HtIm12), .DRe(URe22),
     .ZRe(VfRe12), .ZIm(VfIm12)
   );

   // Vf(2,2) = Ht(2,1)*U(1,2)+Ht(2,2)*U(2,2)
   cmult13 m22 (
     .BFRModemClk(BFRModemClk), .nBFRModemRst(nBFRModemRst), .tctlAdvance(tctlAdvance),
     .ARe(HtRe21), .AIm(HtIm21), .BRe(URe12), .BIm(UIm12),
     .CRe(HtRe22), .CIm(HtIm22), .DRe(URe22),
     .ZRe(VfRe22), .ZIm(VfIm22)
   );

//`ifdef RW_MUMIMO_RX_EN
   // Vf(3,1) = Ht(3,1)*U(1,1)+Ht(3,2)*U(2,1)
   cmult13 m31 (
     .BFRModemClk(BFRModemClk), .nBFRModemRst(nBFRModemRst), .tctlAdvance(tctlAdvance),
     .ARe(HtRe31), .AIm(HtIm31), .BRe(URe11), .BIm(UIm11),
     .CRe(HtRe32), .CIm(HtIm32), .DRe(URe21),
     .ZRe(VfRe31), .ZIm(VfIm31)
   );

   // Vf(4,1) = Ht(4,1)*U(1,1)+Ht(4,2)*U(2,1)
   cmult13 m41 (
     .BFRModemClk(BFRModemClk), .nBFRModemRst(nBFRModemRst), .tctlAdvance(tctlAdvance),
     .ARe(HtRe41), .AIm(HtIm41), .BRe(URe11), .BIm(UIm11),
     .CRe(HtRe42), .CIm(HtIm42), .DRe(URe21),
     .ZRe(VfRe41), .ZIm(VfIm41)
   );

   // Vf(3,2) = Ht(3,1)*U(1,2)+Ht(3,2)*U(2,2)
   cmult13 m32 (
     .BFRModemClk(BFRModemClk), .nBFRModemRst(nBFRModemRst), .tctlAdvance(tctlAdvance),
     .ARe(HtRe31), .AIm(HtIm31), .BRe(URe12), .BIm(UIm12),
     .CRe(HtRe32), .CIm(HtIm32), .DRe(URe22),
     .ZRe(VfRe32), .ZIm(VfIm32)
   );

   // Vf(4,2) = Ht(4,1)*U(1,2)+Ht(4,2)*U(2,2)
   cmult13 m42 (
     .BFRModemClk(BFRModemClk), .nBFRModemRst(nBFRModemRst), .tctlAdvance(tctlAdvance),
     .ARe(HtRe41), .AIm(HtIm41), .BRe(URe12), .BIm(UIm12),
     .CRe(HtRe42), .CIm(HtIm42), .DRe(URe22),
     .ZRe(VfRe42), .ZIm(VfIm42)
   );

//`endif

// Only one antenna. No multipliers
`else // RW_TXRX_2X2
// Single-antenna build: Vf is a direct copy of Ht (first column only).
// Real parts
assign VfRe11 = HtRe11;
assign VfRe21 = HtRe21;
// Imaginary parts
assign VfIm11 = HtIm11;
assign VfIm21 = HtIm21;
//`ifdef RW_MUMIMO_RX_EN
// Rows 3 and 4
assign VfRe31 = HtRe31;
assign VfRe41 = HtRe41;
assign VfIm31 = HtIm31;
assign VfIm41 = HtIm41;
//`endif
`endif


   //
   // Calculate Vv1 and Vv2. Each one is the max abs value over the real and imaginary
   // parts of one column of Vf (eight inputs per maxabs instance).
   // Calculate Vsc1 and Vsc2 by taking the log2 (log2floor) of Vv1 and Vv2.
   //

   // Column 1
   maxabs #(.WIDTH(VFWIDTH)) ma1 (
     .BFRModemClk(BFRModemClk), .nBFRModemRst(nBFRModemRst),
     .I1(VfRe11), .I2(VfIm11), .I3(VfRe21), .I4(VfIm21),
     .I5(VfRe31), .I6(VfIm31), .I7(VfRe41), .I8(VfIm41),
     .Z(Vv1)
   );

   log2floor #(.WIDTH(VFWIDTH)) vl1 (
     .log2in(Vv1),
     .result(Vsc1)
   );

`ifdef RW_TXRX_2X2
   // Column 2 (two antennae only)
   maxabs #(.WIDTH(VFWIDTH)) ma2 (
     .BFRModemClk(BFRModemClk), .nBFRModemRst(nBFRModemRst),
     .I1(VfRe12), .I2(VfIm12), .I3(VfRe22), .I4(VfIm22),
     .I5(VfRe32), .I6(VfIm32), .I7(VfRe42), .I8(VfIm42),
     .Z(Vv2)
   );

   log2floor #(.WIDTH(VFWIDTH)) vl2 (
     .log2in(Vv2),
     .result(Vsc2)
   );

`endif


   //
   // Register Vsc to meet timing. The registered value is clamped to a minimum of 11:
   // dVsc = max(Vsc, 11). These registers update on every clock (no tctlAdvance gating).
   //
   always @(posedge BFRModemClk or negedge nBFRModemRst)
     begin
       if (!nBFRModemRst)
         dVsc1 <= 5'd0;
       else if (Vsc1 <= 5'd11)
         dVsc1 <= 5'd11;
       else
         dVsc1 <= Vsc1;
     end

`ifdef RW_TXRX_2X2
   // Same clamp for the second column.
   always @(posedge BFRModemClk or negedge nBFRModemRst)
     begin
       if (!nBFRModemRst)
         dVsc2 <= 5'd0;
       else if (Vsc2 <= 5'd11)
         dVsc2 <= 5'd11;
       else
         dVsc2 <= Vsc2;
     end
`endif

   //
   // V(x,1) = round(Vf(x,1)*2^(Vsc1-11))
   // Valid value of V is aligned with dVsc, one clock cycle after Vf.
   // All eight components of column 1 share the same scale input, dVsc1.
   //
   roundscale #(.WIDTH(VFWIDTH)) rs1 (.I(VfRe11), .P(dVsc1), .Z(VRe11));
   roundscale #(.WIDTH(VFWIDTH)) rs2 (.I(VfIm11), .P(dVsc1), .Z(VIm11));

   roundscale #(.WIDTH(VFWIDTH)) rs3 (.I(VfRe21), .P(dVsc1), .Z(VRe21));
   roundscale #(.WIDTH(VFWIDTH)) rs4 (.I(VfIm21), .P(dVsc1), .Z(VIm21));
//`ifdef RW_MUMIMO_RX_EN   
   roundscale #(.WIDTH(VFWIDTH)) rs5 (.I(VfRe31), .P(dVsc1), .Z(VRe31));
   roundscale #(.WIDTH(VFWIDTH)) rs6 (.I(VfIm31), .P(dVsc1), .Z(VIm31));

   roundscale #(.WIDTH(VFWIDTH)) rs7 (.I(VfRe41), .P(dVsc1), .Z(VRe41));
   roundscale #(.WIDTH(VFWIDTH)) rs8 (.I(VfIm41), .P(dVsc1), .Z(VIm41));
//`endif

`ifdef RW_TXRX_2X2
   //
   // V(x,2) = round(Vf(x,2)*2^(Vsc2-11))
   // Valid value of V is aligned with dVsc, one clock cycle after Vf.
   // All eight components of column 2 share the same scale input, dVsc2.
   //
   roundscale #(.WIDTH(VFWIDTH)) rs9  (.I(VfRe12), .P(dVsc2), .Z(VRe12));
   roundscale #(.WIDTH(VFWIDTH)) rs10 (.I(VfIm12), .P(dVsc2), .Z(VIm12));

   roundscale #(.WIDTH(VFWIDTH)) rs11 (.I(VfRe22), .P(dVsc2), .Z(VRe22));
   roundscale #(.WIDTH(VFWIDTH)) rs12 (.I(VfIm22), .P(dVsc2), .Z(VIm22));
//`ifdef RW_MUMIMO_RX_EN
   // Row 3 goes to the intermediate iV wires rather than directly to VRe32/VIm32.
   roundscale #(.WIDTH(VFWIDTH)) rs13 (.I(VfRe32), .P(dVsc2), .Z(iVRe32));
   roundscale #(.WIDTH(VFWIDTH)) rs14 (.I(VfIm32), .P(dVsc2), .Z(iVIm32));

   roundscale #(.WIDTH(VFWIDTH)) rs15 (.I(VfRe42), .P(dVsc2), .Z(VRe42));
   roundscale #(.WIDTH(VFWIDTH)) rs16 (.I(VfIm42), .P(dVsc2), .Z(VIm42));
   
   //
   // VRe32 and VIm32 are the roundscale outputs iVRe32/iVIm32 delayed by one clock tick;
   // this makes it easier for synthesis to meet its timing goals.
   // The register is free-running (loaded on every clock edge).
   //
   always @(posedge BFRModemClk or negedge nBFRModemRst)
     begin
       if (!nBFRModemRst)
         begin
           VIm32 <= 13'd0;
           VRe32 <= 13'd0;
         end
       else
         begin
           VIm32 <= iVIm32;
           VRe32 <= iVRe32;
         end
     end

//`endif

   // Select Vt from the twice-delayed second column of V:
   // row 3 when cfgNr is NR3, row 4 for every other cfgNr value.
   always @(*)
     begin
       case (cfgNr)
         NR3:
           begin
             VtRe = ppVRe32;
             VtIm = ppVIm32;
           end
         default: // NR4
           begin
             VtRe = ppVRe42;
             VtIm = ppVIm42;
           end
       endcase
     end
`endif

   //
   // Delay V by one pipeline stage to make pV and by a second one to make ppV.
   // All stages are cleared by the asynchronous reset and advance only when tctlAdvance
   // is asserted. pVt is the registered copy of the combinational Vt selection.
   //
   always @(posedge BFRModemClk or negedge nBFRModemRst)
     begin
       if (!nBFRModemRst)
         begin
           // Column 1, first stage
           pVRe11  <= 13'd0;
           pVIm11  <= 13'd0;
           pVRe21  <= 13'd0;
           pVIm21  <= 13'd0;
           pVRe31  <= 13'd0;
           pVIm31  <= 13'd0;
           pVRe41  <= 13'd0;
           pVIm41  <= 13'd0;
`ifdef RW_TXRX_2X2
           // Column 2, first stage
           pVRe12  <= 13'd0;
           pVIm12  <= 13'd0;
           pVRe22  <= 13'd0;
           pVIm22  <= 13'd0;
           pVRe32  <= 13'd0;
           pVIm32  <= 13'd0;
           pVRe42  <= 13'd0;
           pVIm42  <= 13'd0;
           // Column 2, second stage
           ppVRe12 <= 13'd0;
           ppVIm12 <= 13'd0;
           ppVRe22 <= 13'd0;
           ppVIm22 <= 13'd0;
           ppVRe32 <= 13'd0;
           ppVIm32 <= 13'd0;
           ppVRe42 <= 13'd0;
           ppVIm42 <= 13'd0;
           // Registered Vt
           pVtRe   <= 13'd0;
           pVtIm   <= 13'd0;
`endif
         end
       else if (tctlAdvance)
         begin
           // Column 1, first stage
           pVRe11  <= VRe11;
           pVIm11  <= VIm11;
           pVRe21  <= VRe21;
           pVIm21  <= VIm21;
           pVRe31  <= VRe31;
           pVIm31  <= VIm31;
           pVRe41  <= VRe41;
           pVIm41  <= VIm41;
`ifdef RW_TXRX_2X2
           // Column 2, first stage
           pVRe12  <= VRe12;
           pVIm12  <= VIm12;
           pVRe22  <= VRe22;
           pVIm22  <= VIm22;
           pVRe32  <= VRe32;
           pVIm32  <= VIm32;
           pVRe42  <= VRe42;
           pVIm42  <= VIm42;
           // Column 2, second stage
           ppVRe12 <= pVRe12;
           ppVIm12 <= pVIm12;
           ppVRe22 <= pVRe22;
           ppVIm22 <= pVIm22;
           ppVRe32 <= pVRe32;
           ppVIm32 <= pVIm32;
           ppVRe42 <= pVRe42;
           ppVIm42 <= pVIm42;
           // Registered Vt
           pVtRe   <= VtRe;
           pVtIm   <= VtIm;
`endif
         end
     end
   
   //
   // Instantiate the CordicPhase blocks to calculate the angle of each element of the
   // first column of pV. All four are started by tctlAdvance; only CordPhi1 has its Done
   // output connected (CordicADone).
   //
   cordicPhase #(.PREPOSTPROC(1)) CordPhi1 (
     .Clk(BFRModemClk), .Rst(nBFRModemRst),
     .Start(tctlAdvance), .Done(CordicADone),
     .RIn(pVRe11), .IIn(pVIm11),
     .ROut(A1), .POut(phix11)
   );

   cordicPhase #(.PREPOSTPROC(1)) CordPhi2 (
     .Clk(BFRModemClk), .Rst(nBFRModemRst),
     .Start(tctlAdvance), .Done(),
     .RIn(pVRe21), .IIn(pVIm21),
     .ROut(A2), .POut(phix21)
   );

   cordicPhase #(.PREPOSTPROC(1)) CordPhi3 (
     .Clk(BFRModemClk), .Rst(nBFRModemRst),
     .Start(tctlAdvance), .Done(),
     .RIn(pVRe31), .IIn(pVIm31),
     .ROut(A3), .POut(phix31)
   );

   cordicPhase #(.PREPOSTPROC(1)) CordPhi4 (
     .Clk(BFRModemClk), .Rst(nBFRModemRst),
     .Start(tctlAdvance), .Done(),
     .RIn(pVRe41), .IIn(pVIm41),
     .ROut(A4), .POut(phix41)
   );

   //
   // Calculate psi21 from A1 and A2. Started when CordPhi1 signals CordicADone.
   //
   cordicPhase CordPsi1 (
     .Clk(BFRModemClk), .Rst(nBFRModemRst),
     .Start(CordicADone), .Done(CordicPsi1Done),
     .RIn(A1), .IIn(A2),
     .ROut(), .POut(psi21)
   );
   
   //
   // Calculate psi31.
   // Real input of CordPsi31 (BRe3): A2 multiplied by 1686 >> 12 (implemented as
   // 843 >> 11) and registered on tctlAdvance, i.e. delayed by a pipeline tick.
   // Imaginary input of CordPsi31 (BIm3): IOut of cr1, which rotates (A3, 0) by psi21.
   //
//`ifdef RW_MUMIMO_RX_EN
   // A2 sign-extended to 23 bits, times 843.
   assign BRe3tmp = {{10{A2[12]}},A2} * 23'sd843;

   always @(posedge BFRModemClk or negedge nBFRModemRst)
     begin
       if (!nBFRModemRst)
         BRe3 <= 13'd0;
       else if (tctlAdvance)
         // Product bits [22:11] with the sign bit replicated to fill 13 bits.
         BRe3 <= {BRe3tmp[22],BRe3tmp[22:11]};
     end

   cordicRot cr1 (
     .Clk(BFRModemClk), .Rst(nBFRModemRst),
     .Start(CordicPsi1Done), .Done(CordicB3Done),
     .RIn(A3), .IIn(13'd0), .ThetaIn(psi21),
     .ROut(), .IOut(BIm3)
   );

   cordicPhase CordPsi31 (
     .Clk(BFRModemClk), .Rst(nBFRModemRst),
     .Start(CordicB3Done), .Done(CordicPsi31Done),
     .RIn(BRe3), .IIn(BIm3),
     .ROut(), .POut(psi31)
   );
   
   //
   // Calculate psi41.
   // Real input of CordPsi41 (BRe4): A3 delayed by one pipeline tick (pA3), multiplied
   // by 1686 >> 12 (implemented as 843 >> 11) and registered again on tctlAdvance.
   // Imaginary input of CordPsi41 (BIm4): IOut of cr2, which rotates (pA4, 0) by psi31,
   // where pA4 is A4 delayed by one pipeline tick.
   //
   // pA3 sign-extended to 23 bits, times 843.
   assign BRe4tmp = {{10{pA3[12]}},pA3} * 23'sd843;

   always @(posedge BFRModemClk or negedge nBFRModemRst)
     begin
       if (!nBFRModemRst)
         begin
           pA3  <= 13'd0;
           pA4  <= 13'd0;
           BRe4 <= 13'd0;
         end
       else if (tctlAdvance)
         begin
           pA3  <= A3;
           pA4  <= A4;
           // Product bits [22:11] with the sign bit replicated to fill 13 bits.
           BRe4 <= {BRe4tmp[22],BRe4tmp[22:11]};
         end
     end

   cordicRot cr2 (
     .Clk(BFRModemClk), .Rst(nBFRModemRst),
     .Start(CordicPsi31Done), .Done(CordicB4Done),
     .RIn(pA4), .IIn(13'd0), .ThetaIn(psi31),
     .ROut(), .IOut(BIm4)
   );

   cordicPhase CordPsi41 (
     .Clk(BFRModemClk), .Rst(nBFRModemRst),
     .Start(CordicB4Done), .Done(),
     .RIn(BRe4), .IIn(BIm4),
     .ROut(), .POut(psi41)
   );

//`endif
    
   //
   // Drive the psi outputs. For psi21/psi31/psi41 a negative value (sign bit set) is
   // replaced by zero; otherwise the value is passed through.
   // Note that phiPsi32 can come from two different places depending on cfgNr.
   //
   assign phiPsi21 = (psi21[11] == 1'b0) ? psi21[11:0] : 12'd0;
//`ifdef RW_MUMIMO_RX_EN       
   assign phiPsi31 = (psi31[11] == 1'b0) ? psi31[11:0] : 12'd0;
   assign phiPsi41 = (psi41[11] == 1'b0) ? psi41[11:0] : 12'd0;
`ifdef RW_TXRX_2X2
   // psiNr2 is psi32 when nr=3, else psi42
   assign phiPsi42 = psiNr2Pos;
   assign phiPsi32 = (cfgNr != NR3) ? psi32Nr4Pos : psiNr2Pos;
`endif
//`endif    

   //
   // Compute phi:
   // Subtract the value of phiNr1, selected by cfgNr: phix21 for NR2, phix31 for NR3,
   // and phix41 for any other cfgNr value.
   //
   always @(*)
     begin
       if (cfgNr == NR2)
         SubK = phix21;
       else if (cfgNr == NR3)
         SubK = phix31;
       else
         SubK = phix41;
     end

   // Both operands are sign-extended to 13 bits for the subtraction; only the low
   // 12 bits are kept afterwards, so the result wraps on signed overflow.
   assign phiy11 = {phix11[11],phix11}-{SubK[11],SubK};
   assign phiz11 = phiy11[11:0];

//`ifdef RW_MUMIMO_RX_EN   
   assign phiy21 = {phix21[11],phix21}-{SubK[11],SubK};
   assign phiz21 = phiy21[11:0];

   assign phiy31 = {phix31[11],phix31}-{SubK[11],SubK};
   assign phiz31 = phiy31[11:0];
//`endif   
    
      
   // 
   // Assign the phi signals that are used as outputs. The signed 12-bit phiz values
   // are driven onto the unsigned 12-bit output ports bit for bit.
   //
   // Starting here phi values are positive
   //phi(phi<0) = OverflowUsgn(phi(phi<0) + 2^nP,nP);
   assign phiPhi11 = phiz11;
//`ifdef RW_MUMIMO_RX_EN       
   assign phiPhi21 = (cfgNr != NR2) ? phiz21 : 12'd0; // relevant if Nr>2, so zero if Nr=2
   assign phiPhi31 = (cfgNr != NR4) ? 12'd0 : phiz31; // relevant if Nr>3, i.e. Nr=4
`ifdef RW_TXRX_2X2
   assign phiPhi22 = phiz22;
   assign phiPhi32 = phiz32;
`endif
//`endif       
   
//`ifdef RW_MUMIMO_RX_EN

`ifdef RW_TXRX_2X2
   //
   // Compute phiNr2 for Nr=3 or 4, i.e. phi32 or phi42
   // 
     
   // The start of this Cordic is aligned with CordPhi6 so that the result is available at the same time
   // CordPhi5 shares its Start (CordicE2Done) with CordPhi6 so both results are
   // available at the same time. Inputs are (pVtRe, pVtIm); ROut drives ONr and
   // POut drives phixNr2. The Done output is not used.
   cordicPhase #(.PREPOSTPROC(1)) CordPhi5 (
     .Clk   (BFRModemClk),
     .Rst   (nBFRModemRst),
     .Start (CordicE2Done),
     .Done  (),
     .RIn   (pVtRe),
     .IIn   (pVtIm),
     .ROut  (ONr),
     .POut  (phixNr2)
   );

   //
   // phiNr2 is phixNr2 widened by one bit: bit 11 is replicated on top
   // (sign extension from 12 to 13 bits).
   //
   assign phiNr2 = {phixNr2[11], phixNr2};
     
   //
   // Nr = 3,4
   // if Nr > 2 && Nc > 1
   // Compensate phi(:,1) on second column
   //
   // Negated first-column phi values, used as the rotation angles of cr3/cr4/cr5.
   assign minusPhiz11 = -phiz11;
   assign minusPhiz21 = -phiz21;
   assign minusPhiz31 = -phiz31;

   // cr3, cr4 and cr5 all start on CordicADone and take (ppVRe{n}2, ppVIm{n}2) for
   // rows n = 1, 2, 3 with angle minusPhiz{n}1, producing (CRe{n}, CIm{n}).
   // Only cr5 exports its Done pulse (CordicCDone), which starts the next stage.
   cordicRot cr3 (
     .Clk     (BFRModemClk),
     .Rst     (nBFRModemRst),
     .Start   (CordicADone),
     .Done    (),
     .RIn     (ppVRe12),
     .IIn     (ppVIm12),
     .ThetaIn (minusPhiz11),
     .ROut    (CRe1),
     .IOut    (CIm1)
   );

   cordicRot cr4 (
     .Clk     (BFRModemClk),
     .Rst     (nBFRModemRst),
     .Start   (CordicADone),
     .Done    (),
     .RIn     (ppVRe22),
     .IIn     (ppVIm22),
     .ThetaIn (minusPhiz21),
     .ROut    (CRe2),
     .IOut    (CIm2)
   );

   cordicRot cr5 (
     .Clk     (BFRModemClk),
     .Rst     (nBFRModemRst),
     .Start   (CordicADone),
     .Done    (CordicCDone),
     .RIn     (ppVRe32),
     .IIn     (ppVIm32),
     .ThetaIn (minusPhiz31),
     .ROut    (CRe3),
     .IOut    (CIm3)
   );

   //
   // Rotation matrix by psi21
   //
   // cr6 handles the real parts (CRe2, CRe1) and cr7 the imaginary parts (CIm2, CIm1),
   // both with angle psi21 and both started by CordicCDone.
   // ROut gives ExRe2 / ExIm2, IOut gives Icr1 / Icr2.
   // Only cr7 exports its Done pulse (CordicE2Done).
   cordicRot cr6 (
     .Clk     (BFRModemClk),
     .Rst     (nBFRModemRst),
     .Start   (CordicCDone),
     .Done    (),
     .RIn     (CRe2),
     .IIn     (CRe1),
     .ThetaIn (psi21),
     .ROut    (ExRe2),
     .IOut    (Icr1)
   );

   cordicRot cr7 (
     .Clk     (BFRModemClk),
     .Rst     (nBFRModemRst),
     .Start   (CordicCDone),
     .Done    (CordicE2Done),
     .RIn     (CIm2),
     .IIn     (CIm1),
     .ThetaIn (psi21),
     .ROut    (ExIm2),
     .IOut    (Icr2)
   );


   
   //
   // Nr = 4
   // Rotation matrix by psi31
   //   
   
   // Scale the cr6/cr7 IOut values (Icr1, Icr2) by the constant 2487.
   // Bit 12 of each input is replicated 12 times in front of it, i.e. a manual sign
   // extension to 25 bits (assuming Icr1/Icr2 are 13 bits wide -- confirm against
   // the declarations) before the multiplication.
   assign DReA = {{12{Icr1[12]}},Icr1} * 25'sd2487;   
   assign DImA = {{12{Icr2[12]}},Icr2} * 25'sd2487;
   
   // Keep product bits [22:10]: the 10 LSBs are dropped (divide by 1024) and any
   // bits above bit 22 are discarded.
   assign DRe = DReA[22:10];
   assign DIm = DImA[22:10];

   //
   // Registers / pipelining.
   //
   // Stage pipelining: pCRe3/pCIm3 capture the cr5 outputs (CRe3, CIm3) only on
   // clock edges where tctlAdvance is high. Cleared by the active-low asynchronous reset.
   always @(posedge BFRModemClk or negedge nBFRModemRst)
     if (!nBFRModemRst)
       begin
         pCRe3 <= 13'd0;
         pCIm3 <= 13'd0;
       end
     else if (tctlAdvance)
       begin
         pCRe3 <= CRe3;
         pCIm3 <= CIm3;
       end

   // Single clock pipelining: dDRe/dDIm register the scaled values DRe/DIm on every
   // clock edge, independent of tctlAdvance. Same asynchronous reset.
   always @(posedge BFRModemClk or negedge nBFRModemRst)
     if (!nBFRModemRst)
       begin
         dDRe <= 13'd0;
         dDIm <= 13'd0;
       end
     else
       begin
         dDRe <= DRe;
         dDIm <= DIm;
       end
   
   // psi31 arrives a half pipe delay after pC and D, so use it to start cr8 and cr9
   // Use this delay to have the flop on D after the multiplier
   // cr8 (real path) and cr9 (imaginary path) both start on CordicPsi31Done and use
   // angle psi31. RIn comes from the stage registers pCRe3/pCIm3 and IIn from the
   // registered scaled values dDRe/dDIm. Only ROut is used (ExRe3 / ExIm3).
   // cr9 exports the Done pulse (CordicE3Done).
   cordicRot cr8 (
     .Clk     (BFRModemClk),
     .Rst     (nBFRModemRst),
     .Start   (CordicPsi31Done),
     .Done    (),
     .RIn     (pCRe3),
     .IIn     (dDRe),
     .ThetaIn (psi31),
     .ROut    (ExRe3),
     .IOut    ()
   );

   cordicRot cr9 (
     .Clk     (BFRModemClk),
     .Rst     (nBFRModemRst),
     .Start   (CordicPsi31Done),
     .Done    (CordicE3Done),
     .RIn     (pCIm3),
     .IIn     (dDIm),
     .ThetaIn (psi31),
     .ROut    (ExIm3),
     .IOut    ()
   );
   
   //
   // Signed overflow and shift by 1
   //
   // Each E value is the low 12 bits of the corresponding Ex value with a zero appended
   // as LSB: a left shift by one in which the bits of Ex above bit 11 are dropped.
   assign ERe2 = {ExRe2[11:0],1'b0};
   assign EIm2 = {ExIm2[11:0],1'b0};
   assign ERe3 = {ExRe3[11:0],1'b0};
   assign EIm3 = {ExIm3[11:0],1'b0};
 
   //
   // Compute phi(2:3,2)
   //
      
   //
   // Delay phiNr2 by one pipeline delay.
   //
   // pPhiNr2 holds phiNr2 delayed by one tctlAdvance step.
   // Cleared by the active-low asynchronous reset; holds its value while tctlAdvance is low.
   always @(posedge BFRModemClk or negedge nBFRModemRst)
     if (!nBFRModemRst)
       pPhiNr2 <= 13'd0;
     else if (tctlAdvance)
       pPhiNr2 <= phiNr2;
     
   // Warning! If the Start timing of CordPhi6 is changed, the CordPhi5 Start must be
   // changed too so that both remain aligned (both currently use CordicE2Done).
   // Inputs are (ERe2, EIm2); ROut drives O2 and POut drives phix22. Done is not used.
   cordicPhase #(.PREPOSTPROC(1)) CordPhi6 (
     .Clk   (BFRModemClk),
     .Rst   (nBFRModemRst),
     .Start (CordicE2Done),
     .Done  (),
     .RIn   (ERe2),
     .IIn   (EIm2),
     .ROut  (O2),
     .POut  (phix22)
   );

   // phi22 is obtained by subtracting phi32 or phi42, depending on Nr; the correct
   // value is held in phiNr2. phix22 is widened by replicating bit 11 before the
   // subtraction, and only the low 12 bits of the difference are kept (wraps modulo 2^12).
   assign phiy22 = {phix22[11], phix22} - phiNr2;
   assign phiz22 = phiy22[11:0];
        
   // CordPhi7 starts on CordicE3Done with inputs (ERe3, EIm3); ROut drives O3 and
   // POut drives phix32. Its Done pulse (CordicPhi7Done) starts CordPhi8.
   cordicPhase #(.PREPOSTPROC(1)) CordPhi7 (
     .Clk   (BFRModemClk),
     .Rst   (nBFRModemRst),
     .Start (CordicE3Done),
     .Done  (CordicPhi7Done),
     .RIn   (ERe3),
     .IIn   (EIm3),
     .ROut  (O3),
     .POut  (phix32)
   );

   // phi32 is used only when Nr=4 (in the packer). It is obtained by subtracting phi42,
   // taken from the delayed copy pPhiNr2. phix32 is widened by replicating bit 11
   // first, and only the low 12 bits of the difference are kept (wraps modulo 2^12).
   assign phiy32 = {phix32[11], phix32} - pPhiNr2;
   assign phiz32 = phiy32[11:0];


   //
   // Compute psi(3:Nr,2)
   //   
   
   //
   // Delay O2 and psi31 by two pipeline delays.
   //
   // Two-deep delay lines, shifted only when tctlAdvance is high:
   //   O2    -> pO2    -> ppO2
   //   psi31 -> pPsi31 -> ppPsi31
   // All four registers are cleared by the active-low asynchronous reset.
   always @(posedge BFRModemClk or negedge nBFRModemRst)
     if (!nBFRModemRst)
       begin
         pO2     <= 13'd0;
         ppO2    <= 13'd0;
         pPsi31  <= 12'd0;
         ppPsi31 <= 12'd0;
       end
     else if (tctlAdvance)
       begin
         // Non-blocking assignments: the second stage picks up the previous
         // first-stage value.
         ppO2    <= pO2;
         pO2     <= O2;
         ppPsi31 <= pPsi31;
         pPsi31  <= psi31;
       end
   
   // CordPhi8 output is used only when Nr=4, but the block is always started. Its done signal starts cr10 also when Nr=3
   // CordPhi8 starts on CordicPhi7Done with inputs (pO2, O3); ROut drives tmpR8 and
   // POut drives psi32Nr4. Its Done pulse (CordicPhi8Done) starts cr10.
   cordicPhase CordPhi8 (
     .Clk   (BFRModemClk),
     .Rst   (nBFRModemRst),
     .Start (CordicPhi7Done),
     .Done  (CordicPhi8Done),
     .RIn   (pO2),
     .IIn   (O3),
     .ROut  (tmpR8),
     .POut  (psi32Nr4)
   );

   // Force the value to zero when bit 11 of psi32Nr4 is set.
   assign psi32Nr4Pos = psi32Nr4[11] ? 12'd0 : psi32Nr4;
   
   // Combinational selection of the cr10 input (R8), the ONr gain constant (gx) and
   // the cr10 angle (psiNr1). The non-NR3 (Nr=4) values are assigned first as
   // defaults and then overridden when cfgNr == NR3, so every signal is always driven.
   always @(*)
     begin
       // NR4 (and any other cfgNr value)
       R8     = {tmpR8[11:0], 2'b00};
       gx     = 25'd941;
       psiNr1 = psi41;

       if (cfgNr == NR3)
         begin
           // Use pipes on O2 and Psi31 to align timing of Nr=3 and Nr=4 cases (3 pipes added)
           R8     = {ppO2[11:0], 2'b00};
           gx     = 25'd2287;
           psiNr1 = ppPsi31;
         end
     end
  
  // cr10 is instantiated with INWIDTH = 14. It starts on CordicPhi8Done with real input
  // R8, a zero imaginary input and angle psiNr1. Only ROut (FRe) is used.
  // Its Done pulse (CordicCr10Done) starts CordPhi9.
  cordicRot #(.INWIDTH(14)) cr10 (
    .Clk     (BFRModemClk),
    .Rst     (nBFRModemRst),
    .Start   (CordicPhi8Done),
    .Done    (CordicCr10Done),
    .RIn     (R8),
    .IIn     (14'd0),
    .ThetaIn (psiNr1),
    .ROut    (FRe),
    .IOut    ()
  );

   //
   // Delay ONr by two pipeline delays.
   // Put mult and shift logic between pipes
   //
   // Multiply the first-stage register pONr by the selected gain gx. Bit 12 of pONr is
   // replicated 12 times in front of it (manual sign extension to 25 bits).
   assign tmpOs = {{12{pONr[12]}}, pONr} * gx; // [24:0]

   // Two register stages, both advanced only when tctlAdvance is high:
   //   stage 1: pONr    <= ONr
   //   stage 2: ppGxONr <= product bits [24:12] (the 12 LSBs of tmpOs are dropped)
   // Because tmpOs is computed from the current pONr, stage 2 captures the scaled
   // value of the sample loaded into stage 1 on the previous advance.
   always @(posedge BFRModemClk or negedge nBFRModemRst)
     if (!nBFRModemRst)
       begin
         pONr    <= 13'd0;
         ppGxONr <= 13'd0;
       end
     else if (tctlAdvance)
       begin
         ppGxONr <= tmpOs[24:12];
         pONr    <= ONr;
       end

  //
  // CordPhi9 calculates psiNr2
  // 

  // The imaginary input of CordPhi9 is the delayed, gain-scaled ONr value.
  assign FIm = ppGxONr;

  // CordPhi9 starts on CordicCr10Done with inputs (FRe, FIm) and drives psiNr2 from
  // POut. Its Done and ROut outputs are not used.
  cordicPhase CordPhi9 (
    .Clk   (BFRModemClk),
    .Rst   (nBFRModemRst),
    .Start (CordicCr10Done),
    .Done  (),
    .RIn   (FRe),
    .IIn   (FIm),
    .ROut  (),
    .POut  (psiNr2)
  );

  // Force the value to zero when bit 11 of psiNr2 is set.
  assign psiNr2Pos = psiNr2[11] ? 12'd0 : psiNr2;
`endif
     
//`endif      
endmodule
