/*
 * Copyright (c) 2001-2002 The Trustees of Indiana University.  
 *                         All rights reserved.
 * Copyright (c) 1998-2001 University of Notre Dame. 
 *                         All rights reserved.
 * Copyright (c) 1994-1998 The Ohio State University.  
 *                         All rights reserved.
 * 
 * This file is part of the LAM/MPI software package.  For license
 * information, see the LICENSE file in the top level directory of the
 * LAM/MPI source distribution.
 * 
 * $HEADER$
 *
 * $Id: lamcid.c,v 6.14.2.2 2003/06/20 19:15:36 jsquyres Exp $
 *
 *	Function:	- manage context identifiers
 *			- trivial version
 */

#include <lam_config.h>

#include <typical.h>
#include <mpi.h>
#include <mpisys.h>
#include <rpisys.h>
#include <string.h>
#include <errno.h>


/*
 * local variables
 */
/* Size in bytes of each CID bitmap.  Starts at -1; lam_init_cid()
   sets it. */
static int map_size = -1;
/* This process's CID bitmap: bits are set by lam_setcid() and cleared
   by rmcid().  Allocated in lam_init_cid(), freed in lam_nukecids(). */
static unsigned char *cid_map = NULL;
/* All-zero bitmap of the same size; lam_coll_alloc_intra_cid() sends
   it instead of cid_map when the caller does not contribute. */
static unsigned char *empty_map = NULL;


/*
 * local functions
 */
/* Return the index of the first clear bit in a map, or LAMERROR */
static int find_unused(unsigned char *map);
/* Clear the bit for a single CID in cid_map */
static void rmcid(int cid);


/*
 *	lam_init_cid
 *
 *	Function:	- allocate and zero the CID bitmaps
 *	Returns:	- 0 or LAMERROR
 *
 * On failure nothing is left allocated and the file-local state is
 * put back to its initial values, so a later lam_nukecids() is
 * harmless.
 */
int
lam_init_cid(void)
{
  /* NOTE(review): sizeof(unsigned char) is 1, so this makes the map
     lam_mpi_max_cid *bytes* long, i.e. 8 * lam_mpi_max_cid bits, and
     find_unused() scans all of them.  The rationale comment on
     lam_coll_alloc_inter_cid talks about 4096 CIDs fitting in 512
     bytes, which suggests one bit per CID (a division by 8) was
     intended -- TODO confirm before changing. */

  map_size = lam_mpi_max_cid / sizeof(unsigned char);

  /* calloc hands back zeroed memory, so both maps start out empty */

  cid_map = calloc(map_size, sizeof(unsigned char));
  empty_map = calloc(map_size, sizeof(unsigned char));
  if (cid_map == NULL || empty_map == NULL) {
    free(cid_map);
    free(empty_map);
    cid_map = NULL;
    empty_map = NULL;
    map_size = -1;
    return LAMERROR;
  }

  return 0;
}


/*
 *	find_unused
 *
 *	Function:	- scan a CID bitmap for the first clear bit
 *	Accepts:	- bitmap of map_size bytes (a set bit means the
 *			  CID is not available)
 *	Returns:	- lowest free CID or LAMERROR if there is none
 */
static int
find_unused(unsigned char *map)
{
  int cid, nbits, step;

  /* With IMPI, lam_setcid marks three consecutive CIDs per
     communicator, so only every third slot is a candidate. */

#if LAM_WANT_IMPI
  step = 3;
#else
  step = 1;
#endif

  /* Only offer a CID when the whole group of "step" bits lies inside
     the map; otherwise marking the group would touch bits past the end
     of the bitmap.  For step == 1 this is the plain "cid < nbits". */

  nbits = map_size * 8;
  for (cid = 0; cid + step <= nbits; cid += step) {
    if ((map[cid / 8] & (1 << (cid % 8))) == 0)
      return cid;
  }

  /* We didn't find one.  Doh! */

  return LAMERROR;
}


/*
 *	lam_getcid
 *
 *	Function:	- find the lowest context ID that is not marked
 *			  in this process's own CID map
 *			- the CID is not marked as used here
 *	Returns:	- context ID or LAMERROR
 */
int
lam_getcid(void)
{
  int cid;

  cid = find_unused(cid_map);
  return cid;
}


/*
 *	lam_coll_alloc_inter_cid
 *
 *	Function:	- agree on a new CID for a new communicator that
 *			  spans a local group and a remote group
 *	Accepts:	- local communicator
 *			- rank of the local leader in the local communicator
 *			- communicator used to reach the remote leader
 *			- rank of the remote leader in that communicator
 *			- tag for the leader-to-leader exchange
 *			- value handed through to lam_errfunc()
 *			- new CID (out)
 *	Returns:	- MPI_SUCCESS or an error code
 *
 * Rationale: some of LAM's RPIs only support a small number of CIDs
 * (e.g., the lamd RPI), so CIDs have to be used as efficiently as
 * possible.  Hence, we trade off communication efficiency for CID
 * allocation efficiency.  Additionally, with most commodity networks
 * these days (e.g., 10/100Mbps), the knee bend in the latency curve
 * typically occurs around 1k.  So with 4096 CIDs at one bit each,
 * that's only 512 bytes, and it's just as cheap to send 512 bytes as
 * it is to send 1 byte.  Hence, combining whole bitmaps is about as
 * expensive as a reduce/bcast of a single int.
 *
 * Unlike for intracommunicators, callers will always contribute when
 * they invoke this function.  This will change when we implement the
 * intercommunicator version of MPI_COMM_SPLIT, but that hasn't
 * happened yet.
 */
int 
lam_coll_alloc_inter_cid(MPI_Comm lcomm, int lleader,
                         MPI_Comm pcomm, int pleader,
                         int ptag, int blk, int *new_cid)
{
  int idx;
  int rc;
  int was_inter;
  MPI_Request exchange;
  unsigned char *merged;
  unsigned char *peer_map;

  /* One allocation holds both maps: the first half is the local
     (later merged) map, the second half receives the remote map */

  merged = malloc(map_size * 2);
  if (merged == NULL)
    return lam_errfunc(lcomm, blk, lam_mkerr(MPI_ERR_NO_MEM, ENOMEM));
  peer_map = merged + map_size;

  /* OR together the CID maps of the local group at the local leader.
     This may be called from MPI_COMM_DUP; if so, the communicator is
     an intercommunicator, so temporarily clear its inter flag to make
     it look like an intracommunicator for the reduce. */

  was_inter = LAM_IS_INTER(lcomm);
  if (was_inter)
    lcomm->c_flags &= ~LAM_CINTER;
  rc = MPI_Reduce(cid_map, merged, map_size, MPI_BYTE, MPI_BOR,
                  lleader, lcomm);
  if (was_inter)
    lcomm->c_flags |= LAM_CINTER;
  if (rc != MPI_SUCCESS)
    goto fail;

  /* The local leader swaps maps with the remote leader and folds the
     remote map into its own */

  if (lcomm->c_group->g_myrank == lleader) {
    rc = lam_irecv(peer_map, map_size, MPI_BYTE, pleader, ptag,
                   pcomm, &exchange);
    if (rc != MPI_SUCCESS)
      goto fail;

    rc = lam_send(merged, map_size, MPI_BYTE, pleader, ptag,
                  pcomm, LAM_RQISEND);
    if (rc != MPI_SUCCESS)
      goto fail;

    rc = MPI_Wait(&exchange, MPI_STATUS_IGNORE);
    if (rc != MPI_SUCCESS)
      goto fail;

    for (idx = 0; idx < map_size; ++idx)
      merged[idx] |= peer_map[idx];
  }

  /* Hand the combined map to everyone in the local group (same
     inter-flag trick as for the reduce) */

  if (was_inter)
    lcomm->c_flags &= ~LAM_CINTER;
  rc = MPI_Bcast(merged, map_size, MPI_BYTE, lleader, lcomm);
  if (was_inter)
    lcomm->c_flags |= LAM_CINTER;
  if (rc != MPI_SUCCESS)
    goto fail;

  /* The first clear bit in the combined map is the new CID */

  *new_cid = find_unused(merged);
  free(merged);
  if (*new_cid < 0)
    return lam_errfunc(lcomm, blk, lam_mkerr(MPI_ERR_INTERN, EFULL));

  return MPI_SUCCESS;

 fail:
  /* A communication step failed: release the scratch maps and pass
     the error code back unchanged */
  free(merged);
  return rc;
}


/*
 *	lam_coll_alloc_intra_cid
 *
 *	Function:	- agree on a new CID for a new communicator
 *			  (same rationale as the intercommunicator
 *			  version)
 *	Accepts:	- communicator to agree over
 *			- 1 if this process's CID map takes part,
 *			  anything else to send an all-zero map
 *			- value handed through to lam_errfunc()
 *			- new CID (out)
 *	Returns:	- MPI_SUCCESS or an error code
 *
 * Callers may or may not contribute.  For example, the caller may
 * have specified MPI_UNDEFINED for the color in MPI_COMM_SPLIT, and
 * although they still have to take part in the collective, they
 * should not contribute to the new CID value.
 */
int 
lam_coll_alloc_intra_cid(MPI_Comm comm, int contribute, int blk, int *new_cid)
{
  int rc;
  unsigned char *merged;

  /* Scratch space for the combined map */

  merged = malloc(map_size);
  if (merged == NULL)
    return lam_errfunc(comm, blk, lam_mkerr(MPI_ERR_NO_MEM, ENOMEM));

  /* Bitwise-OR everybody's map together; a bit that is still clear
     afterwards is a CID that every contributor has available */

  rc = MPI_Allreduce((contribute == 1) ? cid_map : empty_map, merged,
                     map_size, MPI_BYTE, MPI_BOR, comm);
  if (rc != MPI_SUCCESS) {
    free(merged);
    return rc;
  }

  /* The first clear bit in the combined map is the new CID */

  *new_cid = find_unused(merged);
  free(merged);

  if (*new_cid < 0)
    return lam_errfunc(comm, blk, lam_mkerr(MPI_ERR_INTERN, EFULL));

  return MPI_SUCCESS;
}


/*
 *	lam_setcid
 *
 *	Function:	- set highest used context ID
 *      Accepts:        - context ID 
 */
void
lam_setcid(int cid)
{
  int byte, bit;
  
  byte = cid / 8;
  bit = (1 << (cid % 8));
  cid_map[byte] |= bit;

#if LAM_WANT_IMPI
  /* Shadow/IMPI collective communicator */
  cid++;
  byte = cid / 8;
  bit = (1 << (cid % 8));
  cid_map[byte] |= bit;

  /* Datasync (positive)/Syancack (negative) communicator */
  cid++;
  byte = cid / 8;
  bit = (1 << (cid % 8));
  cid_map[byte] |= bit;

#endif
}


/*
 *	lam_rmcid
 *
 *	Function:	- deallocate (remove) a context ID
 *			- with IMPI, also remove the two context IDs
 *			  that follow it
 *	Accepts:	- context ID
 */
void
lam_rmcid(int cid)
{
  int n;
  int count = 1;

#if LAM_WANT_IMPI
  /* The communicator also owns cid + 1 and cid + 2 */
  count = 3;
#endif

  for (n = 0; n < count; ++n)
    rmcid(cid + n);
}


/*
 *	rmcid
 *
 *	Function:	- deallocate (remove) a context ID
 *	Accepts:	- context ID
 */
static void
rmcid(int cid)
{
  int byte, bit;
  
  byte = cid / 8;
  bit = (1 << (cid % 8));

  cid_map[byte] &= ~bit;
}


/*
 *	lam_nukecids
 *
 *	Function:	- deallocate all context IDs (cleanup)
 */
void
lam_nukecids(void)
{
  if (cid_map != NULL)
    free(cid_map);
  if (empty_map != NULL)
    free(empty_map);

  cid_map = NULL;
  empty_map = NULL;
}
