/* This file is part of q-tools, a collection of performance tools
   Copyright (c) 2003-2006 Hewlett-Packard Development Company, L.P.
   Contributed by David Mosberger-Tang <davidm@hpl.hp.com>
   Contributed by Stephane Eranian <eranian@hpl.hp.com>

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330,
   Boston, MA  02111-1307  USA  */

#include <sys/types.h>
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "q-syscollect.h"
#include "call-counts.h"

#ifdef HAVE_EXPLICIT_PERFMON3
# include <perfmon3/pfmlib_montecito.h>
#else
# include <perfmon/pfmlib_montecito.h>
#endif

/* Set of PMDs named in reg_smpl_pmds[] by the setup routines below:
   PMD38 (ETB index), PMD39 (ETB extension) and the sixteen ETB
   entries PMD48-PMD63.  */
#define ETB_REGS_MASK							\
  (  REG_MASK (38) | REG_MASK (39)					\
   | REG_MASK (48) | REG_MASK (49) | REG_MASK (50) | REG_MASK (51)	\
   | REG_MASK (52) | REG_MASK (53) | REG_MASK (54) | REG_MASK (55)	\
   | REG_MASK (56) | REG_MASK (57) | REG_MASK (58) | REG_MASK (59)	\
   | REG_MASK (60) | REG_MASK (61) | REG_MASK (62) | REG_MASK (63))

/* Compute the bundle address recorded in ETB entry REGNUM.

   REG is the entry itself and PMD39 is the ETB extension register.
   The entry's address field is shifted left by 4; if the entry's
   extension bit in PMD39 is set, the result is advanced by 0x10 (the
   following bundle).  PMD38 is currently not consulted.  */
static inline uint64_t
get_etb_ip (int regnum, pfm_mont_pmd_reg_t reg, pfm_mont_pmd_reg_t pmd38, pfm_mont_pmd_reg_t pmd39)
{
  /* PMD39 layout: entries 0-7 keep their bit at position 8*n, while
     entries 8-15 keep theirs at position 4 + 8*(n-8).  */
  int ext_shift = (regnum < 8) ? 8 * regnum : 4 + 8 * (regnum - 8);

  /* The slot number is deliberately dropped: functions must start
     and end at a bundle-boundary.  */
  uint64_t ip = (reg.pmd48_63_etb_mont_reg.etb_addr << 4);

  if ((pmd39.pmd_val >> ext_shift) & 0x1)
    ip += 0x10;

  return ip;
}

/* Derive a sample address from the ETB snapshot stored at *POSP.

   The snapshot consists of PMD38 (ETB index), PMD39 (ETB extension)
   and then the sixteen ETB entries; *POSP is advanced past all 18
   registers.  The entries are searched backwards, starting with the
   one just before the ETB index, for the first entry whose etb_s flag
   is set (an entry that recorded a branch instruction).

   Returns:
     ENT->ip | 0xc  if no such entry is found;
     ENT->ip | 0xd  if the entry is treated as an "rfi" marker;
     otherwise the address of the entry following the branch entry
     (when one is available and etb_slot is not 3), or else the
     address of the branch entry itself.  */
static uint64_t
get_mont_ip_from_etb (pfm_default_smpl_entry_t *ent, char **posp)
{
  pfm_mont_pmd_reg_t *sample = (pfm_mont_pmd_reg_t *) *posp;
  pfm_mont_pmd_reg_t *etb = sample + 2;	/* first ETB entry */
  pfm_mont_pmd_reg_t pmd38 = sample[0];
  pfm_mont_pmd_reg_t pmd39 = sample[1];
  pfm_mont_pmd_reg_t entry, follower;
  unsigned long ebi, limit, scanned, idx, next, src_addr, dst_addr;

  /* Consume PMD38, PMD39 and the 16 ETB entries.  */
  *posp = (char *) &sample[18];

  ebi = pmd38.pmd38_mont_reg.etbi_ebi;

  /* NOTE(review): when the buffer has not wrapped, ebi + 1 entries
     are scanned, so the final probe lands on entry 15 -- confirm
     that this entry cannot hold stale data.  */
  limit = pmd38.pmd38_mont_reg.etbi_full ? 16 : ebi + 1;

  idx = (ebi + 15) % 16;
  for (scanned = 0; scanned < limit; ++scanned, idx = (idx + 15) % 16)
    {
      entry = etb[idx];
      if (!entry.pmd48_63_etb_mont_reg.etb_s)
	continue;

      /* ENTRY is the most recently recorded branch instruction.  */

      /* Special case: a taken branch from the last slot of a bundle
	 to the very next bundle is treated as an "rfi" marker and
	 ignored in favour of the interrupt IP.  */
      if (scanned > 0 && entry.pmd48_63_etb_mont_reg.etb_slot == 2)
	{
	  next = (idx + 1) & 15;
	  follower = etb[next];
	  src_addr = get_etb_ip (idx, entry, pmd38, pmd39);
	  dst_addr = get_etb_ip (next, follower, pmd38, pmd39);
	  if (dst_addr - src_addr == 0x10)
	    return ent->ip | 0xd;
	}

      /* The branch was taken and its destination was recorded in
	 the following entry; report the destination instead.  */
      if (scanned > 0 && entry.pmd48_63_etb_mont_reg.etb_slot != 3)
	{
	  idx = (idx + 1) & 15;
	  entry = etb[idx];
	}
      return get_etb_ip (idx, entry, pmd38, pmd39);
    }

  /* The ETB did not record any branches.  Fall back to the IP from
     the interrupt and tag bits 0-3 with 0xc so the sample can be
     told apart from ETB-derived samples.  */
  return ent->ip | 0xc;
}

/* Turn one ETB snapshot into call-count records for address space AS.

   POS points at PMD38 (ETB index), followed by PMD39 (ETB extension)
   and the sixteen ETB entries.  A first pass counts the entries that
   recorded a branch instruction (etb_s set).  The sampling interval
   (-ENT->last_reset_val) is then divided by that count, rounded to
   nearest, and a second pass records one call-count entry per branch
   with that weight.

   Returns a pointer just past the sixteen ETB entries.  */
static char *
process_mont_etb_sample (struct addr_space *as,
		    pfm_default_smpl_entry_t *ent, char *pos)
{
  pfm_mont_pmd_reg_t *sample = (pfm_mont_pmd_reg_t *) pos;
  pfm_mont_pmd_reg_t *etb = sample + 2;	/* first ETB entry */
  pfm_mont_pmd_reg_t pmd38 = sample[0];
  pfm_mont_pmd_reg_t pmd39 = sample[1];
  pfm_mont_pmd_reg_t from, to;
  uint64_t interval = -ent->last_reset_val;
  uint64_t first, end, idx, next, from_ip, to_ip;
  unsigned long weight;
  int num_branches = 0;

  /* Walk from FIRST up to (but excluding) END.  Once the buffer has
     wrapped, both equal the ETB index, which is why the counting
     loop below tests its condition at the bottom.  */
  end = pmd38.pmd38_mont_reg.etbi_ebi;
  first = pmd38.pmd38_mont_reg.etbi_full ? pmd38.pmd38_mont_reg.etbi_ebi : 0;

#if 0
  printf ("first=%lu, last=%lu, pmd38=%lx, interval=%ld\n",
	  first, end, pmd38.pmd_val, interval);
  {
    int k;
    for (k = 0; k < 16; ++k)
      printf ("\treg[%d] = %016lx\n", k, etb[k].pmd_val);
  }
#endif

  /* Pass 1: count the branches recorded in the ETB.  */
  idx = first;
  do
    {
      if (etb[idx].pmd48_63_etb_mont_reg.etb_s)
	{
	  ++num_branches;
	  /* Step over the entry holding this branch's target.  */
	  if (!etb[(idx + 1) & 15].pmd48_63_etb_mont_reg.etb_s)
	    idx = (idx + 1) & 15;
	  if (idx == end)
	    break;
	}
      idx = (idx + 1) & 15;
    }
  while (idx != end);

  if (num_branches <= 0)
    return (char *) &etb[16];	/* XXX must be 16-byte aligned??? */

#if 0
  num_branches = 1;
#endif
  /* interval / num_branches, rounded to nearest.  */
  weight = (2*interval + num_branches) / (2*num_branches);

  /* Pass 2: one loop iteration per counted branch.  */
  for (idx = first; num_branches > 0; --num_branches)
    {
      next = (idx + 1) & 15;
      from = etb[idx];
      to = etb[next];

      /* Only entries that recorded a branch instruction (etb_s set)
	 yield a call-count record.  */
      if (from.pmd48_63_etb_mont_reg.etb_s)
	{
	  from_ip = get_etb_ip (idx, from, pmd38, pmd39);
	  to_ip = get_etb_ip (next, to, pmd38, pmd39);

	  /* We are capturing br.ret, so a return from FROM to TO is
	     counted as a call from TO to FROM.  */
	  call_count_add (as->cc, to_ip, from_ip, weight);
	  check_addr_space_mapping (as, from_ip);
	  check_addr_space_mapping (as, to_ip);

	  /* If the following entry is a target entry (etb_s clear),
	     skip over it as well.  */
	  if (!to.pmd48_63_etb_mont_reg.etb_s)
	    idx = next;
	}
      idx = (idx + 1) & 15;
    }
  return (char *) &etb[16];	/* XXX must be 16-byte aligned??? */
}

/* Append the PMC/PMD programming for call-count sampling.

   The ETB is configured to capture taken return branches only, and
   ETB_EVENT is used as the overflow event.  The PMCs chosen by
   pfm_dispatch_events() are appended to PC starting at *NUM_PCSP;
   two PMDs are appended to PD starting at *NUM_PDSP: the overflow
   counter and PMD38 (reset to 0).  PLM is used both as the default
   privilege-level mask and as the ETB's mask.  *B_PD is set to the
   overflow counter's PD slot.  On return *NUM_PCSP and *NUM_PDSP
   include the new entries.  Calls panic() if the event cannot be
   found or dispatched, or if a needed PMC is already present in
   PC.  */
static void
setup_mont_call_count_sampling (pfarg_reg_t *pc, int *num_pcsp,
			   pfarg_reg_t *pd, int *num_pdsp,
			   unsigned int plm, pfarg_reg_t **b_pd)
{
  int first_pc = *num_pcsp, first_pd = *num_pdsp;
  pfmlib_mont_input_param_t mont_in;
  pfmlib_input_param_t in;
  pfmlib_output_param_t out;
  pfarg_reg_t *ovfl_pc, *ovfl_pd, *index_pd;
  unsigned long period;
  unsigned long n;
  int k, status;

  memset (&in, 0, sizeof (in));
  memset (&mont_in, 0, sizeof (mont_in));
  memset (&out, 0, sizeof (out));

  /* ETB capture filter.  */
  mont_in.pfp_mont_etb.etb_used = 1;
  mont_in.pfp_mont_etb.etb_tm  = 0x2;	/* taken branches only */
  mont_in.pfp_mont_etb.etb_ptm = 0x3;	/* any target prediction */
  mont_in.pfp_mont_etb.etb_ppm = 0x3;	/* any path prediction */
  mont_in.pfp_mont_etb.etb_brt = 0x2;	/* return branches only */
  mont_in.pfp_mont_etb.etb_plm = plm;

  in.pfp_dfl_plm = plm;
  in.pfp_event_count = 1;
  /* System-wide monitoring requires privileged monitors.  */
  in.pfp_flags = PFMLIB_PFP_SYSTEMWIDE;

  if (pfm_find_event_byname ("ETB_EVENT", &in.pfp_events[0].event)
      != PFMLIB_SUCCESS)
    panic ("pfm_find_event_byname: failed to find ETB_EVENT\n");

  status = pfm_dispatch_events (&in, &mont_in, &out, NULL);
  if (status != PFMLIB_SUCCESS)
    panic ("pfm_dispatch_events(): %s\n", pfm_strerror (status));

  /* Copy the PMC assignments, refusing any register that an earlier
     setup step has already claimed.  */
  for (n = 0; n < out.pfp_pmc_count; ++n)
    {
      for (k = 0; k < first_pc; ++k)
	if (out.pfp_pmcs[n].reg_num == pc[k].reg_num)
	  panic ("%s: PMC%d is already busy!\n", __FUNCTION__, pc[k].reg_num);
      pc[first_pc + n].reg_num = out.pfp_pmcs[n].reg_num;
      pc[first_pc + n].reg_value = out.pfp_pmcs[n].reg_value;
    }

  ovfl_pc = &pc[first_pc];
  ovfl_pd = &pd[first_pd];
  index_pd = &pd[first_pd + 1];

  /* The counting PMD carries the same register number as the first
     PMC that was just added.  */
  ovfl_pd->reg_num = ovfl_pc->reg_num;
  *b_pd = ovfl_pd;

  /* On overflow of the ETB_EVENT counter: sample the ETB registers,
     notify, randomize the next interval and reset the ETB index
     (PMD38).  */
  ovfl_pc->reg_smpl_pmds[0] = ETB_REGS_MASK;
  ovfl_pc->reg_flags |= PFM_REGFL_OVFL_NOTIFY | PFM_REGFL_RANDOM;
  ovfl_pc->reg_reset_pmds[0] = REG_MASK (38);

  period = -100000;
  ovfl_pd->reg_value = period;		/* initial value */
  ovfl_pd->reg_long_reset = period;	/* min-long-interval */
  ovfl_pd->reg_short_reset = period;	/* min-short-interval */
  ovfl_pd->reg_random_seed = 0xc0ffee;	/* seed */
  ovfl_pd->reg_random_mask = 0x3ff;	/* mask */

  /* Reset value for PMD38.  */
  index_pd->reg_num = 38;
  index_pd->reg_value = 0;
  index_pd->reg_long_reset = 0;
  index_pd->reg_short_reset = 0;

  /* Commit the new pc/pd entries.  */
  *num_pcsp += out.pfp_pmc_count;
  *num_pdsp += 2;
}
	
/* Append the PMC/PMD programming for code sampling.

   The ETB is configured to capture every branch type, taken or not,
   and the event given by cs_event_code is used as the overflow
   event.  The PMCs chosen by pfm_dispatch_events() are appended to PC
   starting at *NUM_PCSP; two PMDs are appended to PD starting at
   *NUM_PDSP: the overflow counter and PMD38 (reset to 0).  PLM is
   used both as the default privilege-level mask and as the ETB's
   mask.  *CS_PD is set to the overflow counter's PD slot.  On return
   *NUM_PCSP and *NUM_PDSP include the new entries.  Calls panic() if
   the event cannot be dispatched or if a needed PMC is already
   present in PC.  */
static void
setup_mont_btb_code_sampling (pfarg_reg_t *pc, int *num_pcsp,
			 pfarg_reg_t *pd, int *num_pdsp,
			 unsigned int plm, pfarg_reg_t **cs_pd)
{
  int first_pc = *num_pcsp, first_pd = *num_pdsp;
  pfmlib_mont_input_param_t mont_in;
  pfmlib_input_param_t in;
  pfmlib_output_param_t out;
  pfarg_reg_t *ovfl_pc, *ovfl_pd, *index_pd;
  unsigned long period;
  unsigned long n;
  int k, status;

  memset (&in, 0, sizeof (in));
  memset (&mont_in, 0, sizeof (mont_in));
  memset (&out, 0, sizeof (out));

  /* ETB capture filter.  */
  mont_in.pfp_mont_etb.etb_used = 1;
  mont_in.pfp_mont_etb.etb_tm  = 0x3;	/* taken or not taken */
  mont_in.pfp_mont_etb.etb_ptm = 0x3;	/* any target prediction */
  mont_in.pfp_mont_etb.etb_ppm = 0x3;	/* any path prediction */
  mont_in.pfp_mont_etb.etb_brt = 0x0;	/* all branch types */
  mont_in.pfp_mont_etb.etb_plm = plm;

  in.pfp_dfl_plm = plm;
  in.pfp_event_count = 1;
  /* System-wide monitoring requires privileged monitors.  */
  in.pfp_flags = PFMLIB_PFP_SYSTEMWIDE;

  in.pfp_events[0].event = cs_event_code;

  status = pfm_dispatch_events (&in, &mont_in, &out, NULL);
  if (status != PFMLIB_SUCCESS)
    panic ("pfm_dispatch_events(): %s\n", pfm_strerror (status));

  /* Copy the PMC assignments, refusing any register that an earlier
     setup step has already claimed.  */
  for (n = 0; n < out.pfp_pmc_count; ++n)
    {
      for (k = 0; k < first_pc; ++k)
	if (out.pfp_pmcs[n].reg_num == pc[k].reg_num)
	  panic ("%s: PMC%d is already busy!\n", __FUNCTION__, pc[k].reg_num);
      pc[first_pc + n].reg_num = out.pfp_pmcs[n].reg_num;
      pc[first_pc + n].reg_value = out.pfp_pmcs[n].reg_value;
    }

  ovfl_pc = &pc[first_pc];
  ovfl_pd = &pd[first_pd];
  index_pd = &pd[first_pd + 1];

  /* The counting PMD carries the same register number as the first
     PMC that was just added.  */
  ovfl_pd->reg_num = ovfl_pc->reg_num;
  *cs_pd = ovfl_pd;

  /* On overflow of the code-sampling counter: sample the ETB
     registers, notify, randomize the next interval and reset the ETB
     index (PMD38).  */
  ovfl_pc->reg_smpl_pmds[0] = ETB_REGS_MASK;
  ovfl_pc->reg_flags |= PFM_REGFL_OVFL_NOTIFY | PFM_REGFL_RANDOM;
  ovfl_pc->reg_reset_pmds[0] = REG_MASK (38);

  /* When sampling on the cycle event the period follows from the
     requested sample rate; for any other event a fixed period is
     used.  */
  if (cs_event_code == cs_cycle_code)
    period = - (long) (cycle_frequency / code_sample_rate);
  else
    period = -100000;	/* your guess is as good as mine... */

  ovfl_pd->reg_value = period;		/* initial value */
  ovfl_pd->reg_long_reset = period;	/* min-long-interval */
  ovfl_pd->reg_short_reset = period;	/* min-short-interval */
  ovfl_pd->reg_random_seed = 0xc0ffee;	/* seed */
  ovfl_pd->reg_random_mask = 0x3ff;	/* mask */

  /* Reset value for PMD38.  */
  index_pd->reg_num = 38;
  index_pd->reg_value = 0;
  index_pd->reg_long_reset = 0;
  index_pd->reg_short_reset = 0;

  /* Commit the new pc/pd entries.  */
  *num_pcsp += out.pfp_pmc_count;
  *num_pdsp += 2;
}

/* Montecito support table: binds the Montecito PMU type to the ETB
   setup and sample-decoding routines defined in this file.  */
struct qsys_btb_support qsys_montecito =
  {
    .pmu_type                  = PFMLIB_MONTECITO_PMU,
    .get_ip_from_btb           = get_mont_ip_from_etb,
    .process_btb_sample        = process_mont_etb_sample,
    .setup_call_count_sampling = setup_mont_call_count_sampling,
    .setup_btb_code_sampling   = setup_mont_btb_code_sampling
  };
