/***************************************************************************
 *
 * Copyright (c) 2000, 2001, 2002, 2003, 2004 BalaBit IT Ltd, Budapest, Hungary
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 *
 * Note that this permission is granted for only version 2 of the GPL.
 *
 * As an additional exemption you are allowed to compile & link against the
 * OpenSSL libraries as published by the OpenSSL project. See the file
 * COPYING for details.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * $Id$
 *
 * Author  : Fules
 * Auditor : bazsi
 * Last audited version: 
 * Notes:
 *
 ***************************************************************************/

#include <zorp/blob.h>
#include <zorp/log.h>
#include <zorp/policy.h>

#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>

/* Some words about the locking semantics of blobs
 *
 * - Blob system needn't be locked, because only the management thread accesses
 *   the enclosed data
 * - Blob system's
 * - Communication between a blob and its blob system goes through the async
 *   queue of the blob system:
 *   - the client stores the request code (and args) in its own variables
 *   - pushes itself into the system's async queue
 *   - waits until the management thread notifies it about the completion of
 *     the request by signalling the blob's cond_reply
 * 
 * - Blobs must have a downgradeable rw-lock to be able to access the
 *   swapped-out file:
 *   - First a write-lock must be acquired for swapping out
 *   - During the file access a read-lock must be held to prevent fetch-ins
 *   - The transition between these ('downgrading') must be atomic to
 *     prevent race conditions (a fetch-in in the gap).
 *   - So the access to the rwlock ('lock') must be synchronised via a mutex
 *     ('mtx_lock').
 */

/* Default blob system instance.
 * NULL until z_blob_system_default_init() creates it; z_blob_new() falls back
 * to it when it is called with a NULL blob system. */
ZBlobSystem  *z_blob_system_default = NULL;

/* And its default attributes.
 * z_blob_system_default_init() tries to override each of them from the
 * matching "config.blob.default_*" global attribute before it creates the
 * default instance; the values below are the compiled-in fallbacks.
 * (Don't worry about the names, they will be used only 3 times...)*/
const gchar *z_blob_system_default_tmpdir = "/var/lib/zorp/tmp/"; /* directory to store the blobs in */
gsize z_blob_system_default_max_disk_usage = 1024*0x100000; /* max disk usage = 1 GB */
gsize z_blob_system_default_max_mem_usage = 256*0x100000;   /* max mem usage = 256 MB */
gsize z_blob_system_default_lowat = 96*0x100000;            /* lowat = 96 MB */
gsize z_blob_system_default_hiwat = 128*0x100000;           /* hiwat = 128 MB */
gsize z_blob_system_default_noswap_max = 16384;             /* noswap_max = 16 kB */

/* local functions of blobs */
static void z_blob_detach(ZBlob *self);
static void z_blob_destroy(ZBlob *self);
static void z_blob_downgrade_lock(ZBlob *self);
static gsize z_blob_alloc(ZBlob *self, gsize req_size);
static void z_blob_swap_out(ZBlob *self);
static void z_blob_fetch_in(ZBlob *self);
static void z_blob_signal_ready(ZBlob *self);


/*
 * Z_BLOB_THREAD_KILL:
 *
 * Sentinel whose address is used as a request pointer. It is not called
 * anywhere in the visible code: z_blob_system_destroy() pushes it into the
 * request queue, and z_blob_system_threadproc() compares every popped request
 * against it to know when to leave its loop.
 */
static void Z_BLOB_THREAD_KILL(void)
{
  /* dummy magic pointer to signal that the management thread should exit */
}

/******************************************************************************
 * ZBlobRequest
 ******************************************************************************/
/*
 * One request sent from a blob to the management thread of its blob system.
 * z_blob_notify_system() builds it as a local variable and pushes its address
 * into the request queue, so it is only valid while that call is waiting for
 * the reply.
 */
typedef struct ZBlobRequest
{
  ZBlob               *blob;              /* the blob that issued the request */
  ZBlobRequestCode    code;                    /* request code, processed by the blob systems threadproc */
  gboolean            approved;           /* reply from the blob systems threadproc */
  union
    {
      gssize          alloc_req;          /* Z_BLOB_REQ_ALLOC only: number of additional bytes requested */
    } u;
} ZBlobRequest;


/******************************************************************************
 * ZBlobSystem
 ******************************************************************************/

/**
 * z_blob_system_do_alloc:
 * @self: this
 * @blob: blob that asked approval for the allocation
 * @alloc_req: number of additional bytes requested (may be negative)
 *
 * Decides whether @blob may change its allocation by @alloc_req bytes and,
 * if so, books the change in the usage counters of the blob system.
 * The cases are tried in this order:
 *  - the blob is already in a file: always granted, booked on the disk counter
 *  - the request is negative or fits into the free memory: granted, booked on
 *    the memory counter
 *  - the whole blob (current allocation + request) fits into the free disk
 *    space: the blob is swapped out, the request is booked on the disk counter
 *  - otherwise: denied
 *
 * Called only from threadproc(), so both the blob/memory management data and
 * the blob is implicitly locked.
 *
 * NOTE(review): whether z_blob_swap_out() moves the bytes already allocated
 * by the blob from the memory counter to the disk counter is not visible
 * here; only @alloc_req is booked in this function.
 *
 * Returns:
 * TRUE if req granted, FALSE if denied
 */
gboolean 
z_blob_system_do_alloc(ZBlobSystem *self, ZBlob *blob, gssize alloc_req)
{
  gsize         disk_available, mem_available;
  gsize         req_total;

  mem_available = self->mem_max - self->mem_used;
  disk_available = self->disk_max - self->disk_used;
  req_total = blob->alloc_size + alloc_req;
  /* size values are gsize/gssize, so they need the 'z' length modifier */
  z_log(NULL, CORE_DEBUG, 7, "checking allocation request; blob='%p', additional_size='%zd', req_total='%zu', mem_avail='%zu', disk_avail='%zu'", 
        blob, alloc_req, req_total, mem_available, disk_available);

  if (blob->is_in_file)
    {
      z_log(NULL, CORE_DEBUG, 7, "blob is already swapped out, alloc granted;");
      self->disk_used += alloc_req;
      return TRUE;
    }

  if ((alloc_req < 0) || ((gsize)alloc_req <= mem_available))
    {
      self->mem_used += alloc_req;
      z_log(NULL, CORE_DEBUG, 7, "blob management thread alloc granted; new_mem_used='%zu'", (gsize) self->mem_used);
      return TRUE;
    }

  if (req_total <= disk_available) /* don't fit in mem but fits on disk */
    {
      /* FIXME: !!! FOR TESTING ONLY !!!
       * The current swapping policy is definitely quite lame, it must
       * be replaced with some more intelligent algorithm.
       * Now, if the blob can't be kept in memory, it goes directly to disk.
       * Then, if any blob gets freed, we try to find the most appropriate
       * one on disk that would fit in the available ram, and fetch it in.
       * The decision factor is the number of accesses divided by the
       * time elapsed since the last access. Ummm, it cries for some
       * refinement, but should work for now :)...
       */
      z_log(NULL, CORE_DEBUG, 7, "blob management thread swapping out blob; blob='%p'", blob);
      z_blob_swap_out(blob);
      self->disk_used += alloc_req;
      return TRUE;
    }

  /* Two cases end up here, both are denied:
   *  - req_total < disk_available + mem_available: we would have the space
   *    for it, but only partly on disk.
   *    Premise: if we had anything to fetch in, it would have been already
   *    done. (Not quite true, best-fit candidate is fetched in. There may
   *    be a better alignment, but finding it is an NP-strong problem -
   *    analogous to the 'backpack' problem).
   *    So this case is treated as if there weren't enough space.
   *  - otherwise it is impossible to allocate.
   */
  return FALSE;
}

/**
 * z_blob_system_do_swap:
 * @self: this
 *
 * Try to fetch in blobs if there is enough space for that.
 * Nothing happens unless the memory usage is below the low water mark. Then,
 * in every round, all registered blobs that can be write-locked without
 * waiting are examined, and the swapped-out one that fits below the high
 * water mark and has the highest (read + write requests) / (seconds since
 * last access) ratio is fetched in. The rounds go on until no candidate is
 * found. Blobs accessed within the current second (elapsed == 0) are not
 * considered.
 *
 * Called only from threadproc(), so exclusive access to all memory and blob
 * management data is implicitly granted.
 */
void
z_blob_system_do_swap(ZBlobSystem *self)
{
  gsize         space_available;
  gdouble       dec_factor, dec_factor_best;
  GList         *cur;
  ZBlob         *blob, *best;
  time_t        now, elapsed;
  gint          n;

  /* when low water mark is reached, start a fetch-in until high water mark is reached */
  z_log(NULL, CORE_DEBUG, 7, "checking whether to perform fetch-in; mem_used='%zu', lowat='%zu'",
        (gsize) self->mem_used, (gsize) self->lowat);
  if (self->mem_used >= self->lowat)
    return;

  z_log(NULL, CORE_DEBUG, 7, "blob management thread looking for fetch-in candidates;");
  n = 0;
  do
    {
      time(&now);
      space_available = self->hiwat - self->mem_used;
      dec_factor_best = -1;
      best = NULL;

      for (cur = self->blobs; cur; cur = cur->next)
        {
          blob = (ZBlob *) cur->data;
          z_log(NULL, CORE_DEBUG, 7, "examining blob; blob='%p'", blob);
          if (z_blob_lock_write(blob, 0)) /* zero timeout -> trylock */
            {
              if (blob->is_in_file && (blob->alloc_size <= space_available))
                {
                  elapsed = now - blob->stat.last_accessed;
                  if (elapsed > 0)
                    {
                      /* divide as floating point: an integer division would
                       * truncate every ratio below 1 to 0 and make such
                       * blobs indistinguishable */
                      dec_factor = ((gdouble) blob->stat.req_rd + (gdouble) blob->stat.req_wr) / (gdouble) elapsed;
                      if (dec_factor > dec_factor_best)
                        {
                          z_log(NULL, CORE_DEBUG, 7, "best candidate this far; blob='%p'", blob);
                          dec_factor_best = dec_factor;
                          best = blob;
                        }
                    }
                }
              z_blob_unlock_write(blob);
            }
        }

      if (best)
        {
          z_log(NULL, CORE_DEBUG, 7, "blob management thread fetching in blob; blob='%p'", best);
          /* the lock was released after the examination, so take it again
           * and re-check the size before fetching in */
          if (z_blob_lock_write(best, 0)) /* zero timeout -> trylock */
            {
              if (best->alloc_size <= space_available)
                {
                  z_blob_fetch_in(best);
                  z_log(NULL, CORE_DEBUG, 7, "fetched in blob; blob='%p'", best);
                  n++;
                }
              z_blob_unlock_write(best);
            }
        }
    } 
  while (best);
  z_log(NULL, CORE_DEBUG, 7, "blob management fetched in %d blobs;", n);
}

/**
 * z_blob_system_check_waiting:
 * @self: this
 *
 * Walks the list of deferred requests and retries every allocation request.
 * A request that gets approved now is removed from the list and its blob is
 * notified; the others stay in the list. Requests with any other code are
 * left untouched.
 */
void
z_blob_system_check_waiting(ZBlobSystem *self)
{
  GList         *link, *following;
  ZBlobRequest  *pending;
  ZBlob         *owner;

  z_log(NULL, CORE_DEBUG, 7, "blob management thread looking for satisfiable deferred reqs;");
  for (link = self->waiting_list; link; link = following)
    {
      /* remember the successor first, 'link' may get deleted below */
      following = link->next;
      pending = (ZBlobRequest*) link->data;
      owner = pending->blob;
      z_log(NULL, CORE_DEBUG, 7, "examining blob request; req='%p', blob='%p'", pending, owner);

      if (pending->code != Z_BLOB_REQ_ALLOC)
        continue;

      z_log(NULL, CORE_DEBUG, 7, "trying to allocate blob; blob='%p'", owner);
      pending->approved = z_blob_system_do_alloc(self, owner, pending->u.alloc_req);
      if (!pending->approved)
        {
          z_log(NULL, CORE_DEBUG, 7, "alloc still impossible; blob='%p'", owner);
          continue;
        }

      z_log(NULL, CORE_DEBUG, 7, "deferred allocation succeeded; blob='%p'", owner);
      z_blob_signal_ready(owner);
      self->waiting_list = g_list_delete_link(self->waiting_list, link);
    }
}


/**
 * z_blob_system_threadproc:
 * @self: this
 *
 * Thread procedure of ZBlobSystem.
 * After signalling its start to the constructor, it pops requests from the
 * request queue one by one until the Z_BLOB_THREAD_KILL sentinel arrives.
 * Each request is processed while holding mtx_blobsys:
 *  - register/unregister requests add/remove the blob to/from the blob list
 *  - allocation requests are decided by z_blob_system_do_alloc(); a denied
 *    one is put on the waiting list and its blob is NOT notified yet
 *  - unknown codes are refused
 * Then the requesting blob is notified (unless deferred), the waiting list is
 * re-checked and a fetch-in round is attempted.
 *
 * Returns:
 * Currently just @self
 */
static gpointer
z_blob_system_threadproc(ZBlobSystem *self)
{
  ZBlobRequest    *req;
  ZBlob           *blob;
  gboolean        blob_ready;

  z_enter();
  g_assert(self);
  z_log(NULL, CORE_DEBUG, 7, "blob management thread starting;");
  /* the constructor holds mtx_blobsys until it waits on the condition, so
   * taking the mutex here guarantees that the signal is not lost */
  g_mutex_lock(self->mtx_blobsys);
  z_log(NULL, CORE_DEBUG, 7, "blob management thread signalling back to constructor;");
  g_cond_signal(self->cond_thread_started);
  g_mutex_unlock(self->mtx_blobsys);
  while (1)
    {
      z_log(NULL, CORE_DEBUG, 7, "waiting for the queue;");
      /* the waiting list (requests that couldn't be approved yet) is only
       * re-checked after a new request has been processed, see below */
      req = g_async_queue_pop(self->req_queue);   /* blocks until there is a requesting blob in the queue */
      if (req == (ZBlobRequest*)Z_BLOB_THREAD_KILL)
          break;
          
      z_log(NULL, CORE_DEBUG, 7, "blob appeared in the queue;");

      blob = req->blob;
      if (!blob)
        {
          z_log(NULL, CORE_DEBUG, 7, "you don't exist, go away;");
          continue;
        }

      g_mutex_lock(self->mtx_blobsys);
      blob_ready = TRUE;
      switch (req->code)
        {
        case Z_BLOB_REQ_NONE:
          z_log(NULL, CORE_DEBUG, 7, "no request, weird but approved; blob='%p'", blob);
          req->approved = TRUE;
          break;

        case Z_BLOB_REQ_REGISTER:
          g_assert(self == blob->system);
          self->blobs = g_list_append(self->blobs, blob);
          z_log(NULL, CORE_DEBUG, 7, "registered; blob='%p'", blob);
          req->approved = TRUE;
          break;

        case Z_BLOB_REQ_UNREGISTER:
          g_assert(self == blob->system);
          self->blobs = g_list_remove(self->blobs, blob);
          z_log(NULL, CORE_DEBUG, 7, "unregistered; blob='%p'", blob);
          req->approved = TRUE;
          break;

        case Z_BLOB_REQ_ALLOC:
          /* alloc_req is a gssize, so it needs the 'z' length modifier */
          z_log(NULL, CORE_DEBUG, 7, "blob management thread checking alloc req; blob='%p', additional_size='%zd'", 
                blob, req->u.alloc_req);

          req->approved = z_blob_system_do_alloc(self, blob, req->u.alloc_req);

          z_log(NULL, CORE_DEBUG, 7, "reply for alloc request; blob='%p', reply='%s'", 
                blob, req->approved ? "granted" : "denied");

          /* In case of denial, move the blob to the waiting queue */
          if (!req->approved)
            {
              z_log(NULL, CORE_INFO, 4, "adding blob request to the waiting list; req='%p'", req);
              self->waiting_list = g_list_append(self->waiting_list, req);
              blob_ready = FALSE;
            }
          break;

        default:
          z_log(NULL, CORE_DEBUG, 7, "unknown request code; req='%p'; code='%d'", req, req->code);
          req->approved = FALSE;
          break;
        }

      /* send back the result to the blob; 'req' must not be touched after
       * this, because it lives in the notified thread */
      if (blob_ready)
          z_blob_signal_ready(blob);

      z_blob_system_check_waiting(self);
      z_blob_system_do_swap(self);
      g_mutex_unlock(self->mtx_blobsys);
    }
  z_log(NULL, CORE_DEBUG, 7, "blob management thread exiting;");
  z_leave();
  g_thread_exit(self);
  return self;
}

/**
 * z_blob_system_default_init:
 *
 * Initialise the default blob system.
 * Does nothing if the default instance already exists. Otherwise, while
 * holding the main policy, the "config.blob.default_*" global attributes are
 * parsed into the z_blob_system_default_* variables and the default instance
 * is created from them.
 *
 * NOTE(review): the numeric attributes are parsed by z_policy_var_parse_int()
 * directly into gsize variables; confirm that its output parameter type
 * matches gsize on all supported platforms.
 */
void
z_blob_system_default_init(void)
{
  z_enter();
  if (!z_blob_system_default)
    {
      z_policy_acquire_main(current_policy);
      z_policy_var_parse_str(z_global_getattr("config.blob.default_tmpdir"), &z_blob_system_default_tmpdir);
      z_policy_var_parse_int(z_global_getattr("config.blob.default_max_disk_usage"), &z_blob_system_default_max_disk_usage);
      z_policy_var_parse_int(z_global_getattr("config.blob.default_max_mem_usage"), &z_blob_system_default_max_mem_usage);
      z_policy_var_parse_int(z_global_getattr("config.blob.default_lowat"), &z_blob_system_default_lowat);
      z_policy_var_parse_int(z_global_getattr("config.blob.default_hiwat"), &z_blob_system_default_hiwat);
      z_policy_var_parse_int(z_global_getattr("config.blob.default_noswap_max"), &z_blob_system_default_noswap_max);
      /* every size below is an (unsigned) gsize, hence '%zu' throughout */
      z_log(NULL, CORE_DEBUG, 7, "read blob systems default attributes; "
            "tmpdir='%s', max_disk_usage='%zu', max_mem_usage='%zu', "
            "lowat='%zu', hiwat='%zu', noswap_max='%zu'",
            z_blob_system_default_tmpdir, z_blob_system_default_max_disk_usage,
            z_blob_system_default_max_mem_usage, z_blob_system_default_lowat,
            z_blob_system_default_hiwat, z_blob_system_default_noswap_max);

      z_blob_system_default = z_blob_system_new(z_blob_system_default_tmpdir,
                                                z_blob_system_default_max_disk_usage,
                                                z_blob_system_default_max_mem_usage,
                                                z_blob_system_default_lowat,
                                                z_blob_system_default_hiwat,
                                                z_blob_system_default_noswap_max );
      z_policy_release_main(current_policy);
    }
  z_leave();
}

/**
 * z_blob_system_default_destroy:
 *
 * Destroy the default blob system
 */
void
z_blob_system_default_destroy(void)
{
  z_enter();
  if (z_blob_system_default)
      z_blob_system_unref(z_blob_system_default);
  z_leave();
}

/**
 * z_blob_system_new:
 * @dir: directory to put the swapped blobs into
 * @dmax: max disk usage size
 * @mmax: max mem usage size
 * @low: low water mark
 * @hiw: high water mark
 * @nosw: maximal size that won't be swapped
 *
 * Create a new blob system using the given parameters and start its
 * management thread. Water marks that are not below @mmax are lowered to
 * @mmax - 1 (or to 0 if @mmax is 0). The call returns only after the
 * management thread has signalled that it is running.
 *
 * Returns:
 * The new blob system instance (with one reference), or NULL if the
 * management thread could not be created
 */
ZBlobSystem* 
z_blob_system_new(const char* dir, gsize dmax, gsize mmax, gsize low, gsize hiw, gsize nosw)
{
  ZBlobSystem   *self;
  gsize         wat_limit;

  z_enter();
  self = g_new0(ZBlobSystem, 1);

  self->ref_cnt = 1;
  /* the directory name is released with g_free(), so it must come from GLib */
  self->dir = g_strdup(dir);
  self->disk_max = dmax;
  self->mem_max = mmax;
  self->disk_used = self->mem_used = 0;

  /* highest acceptable water mark; guarded so that mmax == 0 doesn't wrap around */
  wat_limit = (mmax > 0) ? (mmax - 1) : 0;
  if (mmax <= low)
    low = wat_limit;
  self->lowat = low;
  if (mmax <= hiw)
    hiw = wat_limit;
  self->hiwat = hiw;
  self->noswap_max = nosw;
  self->blobs = NULL;
  self->mtx_blobsys = g_mutex_new();
  self->cond_thread_started = g_cond_new();
  self->req_queue = g_async_queue_new();
  self->waiting_list = NULL;

  z_log(NULL, CORE_DEBUG, 7, "creating blob management thread;");
  /* the mutex is taken before the thread is created, so the start signal of
   * the thread can't arrive before we wait for it */
  g_mutex_lock(self->mtx_blobsys);
  /* FIXME: z_thread_new() */
  /* counter-FIXME: that one can create only non-joinable threads, so until
   * something like z_thread_joinable_new() gets written, let's rather leave
   * it this way */
  self->thr_management = g_thread_create((GThreadFunc)z_blob_system_threadproc,
                              (gpointer)self, TRUE, &self->thread_error);
  if (!self->thr_management)
    {
      /* nobody would ever signal cond_thread_started, don't wait for it */
      z_log(NULL, CORE_ERROR, 2, "can't create blob management thread; error='%s'",
            self->thread_error ? self->thread_error->message : "unknown");
      g_mutex_unlock(self->mtx_blobsys);
      g_clear_error(&self->thread_error);
      g_async_queue_unref(self->req_queue);
      g_cond_free(self->cond_thread_started);
      g_mutex_free(self->mtx_blobsys);
      g_free(self->dir);
      g_free(self);
      z_leave();
      return NULL;
    }
  g_cond_wait(self->cond_thread_started, self->mtx_blobsys);
  g_mutex_unlock(self->mtx_blobsys);
  z_log(NULL, CORE_DEBUG, 7, "blob management thread up and running;");
  self->active = TRUE;
  z_leave();
  return self;
}

/**
 * z_blob_system_destroy:
 * @self: this
 *
 * Tears down a blob system whose reference count has reached zero:
 * marks it inactive, stops and joins the management thread, refuses every
 * deferred allocation request, detaches all registered blobs and finally
 * releases the resources of the instance.
 */
static void
z_blob_system_destroy(ZBlobSystem *self)
{
  GList         *head;
  ZBlobRequest  *pending;

  z_enter();
  g_assert(self); 
  g_assert(self->ref_cnt == 0);
  z_log(NULL, CORE_DEBUG, 7, "destroying blob system; blobsys='%p'", self);
  self->active = FALSE;

  /* FIXME: locking is needed here */
  z_log(NULL, CORE_DEBUG, 7, "stopping the blob system thread;");
  g_async_queue_push(self->req_queue, Z_BLOB_THREAD_KILL);
  g_thread_join(self->thr_management);

  z_log(NULL, CORE_DEBUG, 7, "refusing deferred reqs;");
  /* consume the waiting list from its head until it is empty */
  while ((head = self->waiting_list) != NULL)
    {
      pending = (ZBlobRequest*) head->data;
      if (pending->code == Z_BLOB_REQ_ALLOC)
        {
          z_log(NULL, CORE_DEBUG, 7, "refusing to allocate blob; blob='%p'", pending->blob);
          pending->approved = FALSE;
          z_blob_signal_ready(pending->blob);
        }
      self->waiting_list = g_list_delete_link(self->waiting_list, head);
    }

  z_log(NULL, CORE_DEBUG, 7, "detaching blobs;");
  /* same for the registered blobs: detach each one and drop its link */
  while ((head = self->blobs) != NULL)
    {
      z_blob_detach((ZBlob*) head->data);
      self->blobs = g_list_delete_link(self->blobs, head);
    }
  /* FIXME: the lock can be released here */

  z_log(NULL, CORE_DEBUG, 7, "destroying instance;");
  g_free(self->dir);    /* a NULL dir is fine for g_free() */
  g_mutex_free(self->mtx_blobsys);
  g_cond_free(self->cond_thread_started);
  g_async_queue_unref(self->req_queue);
  g_list_free(self->waiting_list);
  g_free(self);

  z_log(NULL, CORE_DEBUG, 7, "blob system down;");
  z_leave();
}

/**
 * z_blob_system_ref:
 * @self: this
 *
 * Increments the reference counter of the blob system while holding
 * mtx_blobsys.
 */
void
z_blob_system_ref(ZBlobSystem *self)
{
  z_enter();
  g_mutex_lock(self->mtx_blobsys);
  z_incref(&self->ref_cnt); 
  g_mutex_unlock(self->mtx_blobsys);
  z_leave();
}

/**
 * z_blob_system_unref:
 * @self: this
 *
 * Decrements the reference counter of the blob system while holding
 * mtx_blobsys, and destroys the instance when the counter drops to zero.
 * The destruction happens after the mutex has been released.
 */
void
z_blob_system_unref(ZBlobSystem *self)
{
  gboolean  last_ref_dropped = FALSE;

  z_enter();
  g_mutex_lock(self->mtx_blobsys);
  if (z_decref(&self->ref_cnt) == 0)
    last_ref_dropped = TRUE;
  g_mutex_unlock(self->mtx_blobsys);

  if (last_ref_dropped)
    z_blob_system_destroy(self);
  z_leave();
}



/******************************************************************************
 * ZBlobStatistic
 ******************************************************************************/
/**
 * z_blob_statistic_init:
 * @self: this
 *
 * Resets all counters of the statistic to zero and sets both the creation
 * and the last access time to the current time.
 */
void
z_blob_statistic_init(ZBlobStatistic *self)
{
  g_assert(self);

  /* request and event counters */
  self->alloc_count = 0;
  self->swap_count = self->alloc_count;
  self->req_wr = self->swap_count;
  self->req_rd = self->req_wr;

  /* transferred amounts */
  self->total_wr = 0;
  self->total_rd = self->total_wr;

  /* time stamps */
  self->last_accessed = time(NULL);
  self->created = self->last_accessed;
}

/******************************************************************************
 * ZBlob
 ******************************************************************************/

/**
 * z_blob_notify_system:
 * @self: this
 * @code: the code of the request
 * @arg: pointer to the argument of the request; read only for
 *       Z_BLOB_REQ_ALLOC, where it must point to a gsize holding the
 *       requested amount
 *
 * Sends a request to the management thread of the blob system and blocks
 * until the reply is signalled on the blob's cond_reply. The request lives
 * on the stack of this function, only its address is queued.
 *
 * Returns:
 * The verdict of the management thread; FALSE also if the blob has no blob
 * system any more
 */
gboolean
z_blob_notify_system(ZBlob *self, ZBlobRequestCode code, gpointer arg)
{
  ZBlobRequest      request;

  if (self->system == NULL)
    return FALSE;

  request.blob = self;
  request.code = code;
  request.approved = FALSE;
  if (code == Z_BLOB_REQ_ALLOC)
    request.u.alloc_req = *(gsize*)arg;

  z_log(NULL, CORE_DEBUG, 7, "notifying system; code='%d'", code);

  /* mtx_reply is held from before the push until the wait, so the reply
   * can't be signalled before we are waiting for it */
  g_mutex_lock(self->mtx_reply);
  g_async_queue_push(self->system->req_queue, &request);
  g_cond_wait(self->cond_reply, self->mtx_reply);
  g_mutex_unlock(self->mtx_reply);

  z_log(NULL, CORE_DEBUG, 7, "system notified; result='%d'", request.approved);
  return request.approved;
}

/**
 * z_blob_signal_ready:
 * @self: this
 *
 * Signal the completion of a request by signalling the blob's cond_reply
 * while holding mtx_reply. z_blob_notify_system() holds the same mutex from
 * before queueing its request until it waits on the condition, so the signal
 * can only be sent once the requester is waiting.
 * Called from the blob system side (management thread and blob system
 * destructor).
 */
static void
z_blob_signal_ready(ZBlob *self)
{
  g_mutex_lock(self->mtx_reply);
  g_cond_signal(self->cond_reply);
  g_mutex_unlock(self->mtx_reply);
}


/**
 * z_blob_new:
 * @sys: Blob system to create the blob into
 * @initial_size: Initial size to allocate.
 *
 * Create a new blob. If @sys is NULL, z_blob_system_default will be used.
 * A temporary file named "blob_XXXXXX" is created for the blob in the
 * directory of the blob system right away, then the blob registers itself at
 * the blob system and allocates @initial_size bytes if that is not zero.
 * A refused registration is only logged.
 *
 * NOTE(review): the blob stores the pointer of the blob system without
 * taking a reference on it here.
 *
 * Returns:
 * The new blob instance (with one reference), or NULL if there is no usable
 * blob system or the file of the blob can't be created
 */
ZBlob*
z_blob_new(ZBlobSystem *sys, gsize initial_size)
{
  ZBlob   *self;

  z_enter();
  if (!sys)
    sys = z_blob_system_default;

  if (!sys) 
    {
      z_log(NULL, CORE_DEBUG, 7, "blob system NOT initialised;");
      z_leave();
      return NULL;
    }

  if (!sys->active) 
    {
      z_log(NULL, CORE_DEBUG, 7, "blob system is NOT active;");
      z_leave();
      return NULL;
    }

  self = g_new0(ZBlob, 1);
  self->system = sys;

  /* let GLib size the buffer instead of counting the template by hand;
   * mkstemp() replaces the X's in place */
  self->filename = g_strdup_printf("%s/blob_XXXXXX", self->system->dir);
  self->fd = mkstemp(self->filename);

  if (self->fd < 0)
    {
      z_log(NULL, CORE_ERROR, 2, "can't create blob file: error='%s'", strerror(errno));
      g_free(self->filename);
      g_free(self);
      z_leave();    /* keep the enter/leave trace balanced on this path, too */
      return NULL;
    }

  self->ref_cnt = 1;
  self->access = Z_BLOB_STORE;
  self->mode = Z_BLOB_RANDOM;
  self->size = 0;
  self->alloc_size = 0;
  self->data = g_new0(gchar, self->alloc_size);
  self->is_in_file = FALSE;
  self->mtx_reply = g_mutex_new();
  self->cond_reply = g_cond_new();
  self->mapped_ptr = NULL;
  self->mapped_length = 0;

  z_blob_statistic_init(&self->stat);
  self->mtx_lock = g_mutex_new();
  g_static_rw_lock_init(&(self->lock));

  z_log(NULL, CORE_DEBUG, 7, "registering; blob='%p'", self);
  
  if (!z_blob_notify_system(self, Z_BLOB_REQ_REGISTER, NULL))
    z_log(NULL, CORE_DEBUG, 7, "blob system didn't register; blob='%p'", self);

  if (initial_size > 0)
    z_blob_alloc(self, initial_size);

  z_leave();
  return self;
}

/**
 * z_blob_ref:
 * @self: this
 *
 * Increments the reference counter of the blob while holding its write lock.
 *
 * Returns:
 * @self, for convenience
 */
ZBlob*
z_blob_ref(ZBlob *self)
{
  ZBlob *referenced = self;

  z_enter();
  /* negative timeout: wait for the write lock as long as it takes */
  z_blob_lock_write(referenced, -1);
  z_incref(&referenced->ref_cnt);
  z_blob_unlock_write(referenced);
  z_leave();
  return referenced;
}

/**
 * z_blob_unref:
 * @self: this
 *
 * Decrements the reference counter of the blob while holding its write lock.
 * When the counter drops to zero, the blob is destroyed after the lock has
 * been released.
 */
void
z_blob_unref(ZBlob *self)
{
  gboolean  last_ref_dropped;

  z_enter();
  /* negative timeout: wait for the write lock as long as it takes */
  z_blob_lock_write(self, -1);
  last_ref_dropped = (z_decref(&self->ref_cnt) == 0);
  z_blob_unlock_write(self);

  if (last_ref_dropped)
    z_blob_destroy(self);
  z_leave();
}

/**
 * z_blob_detach:
 * @self: this
 *
 * Detaches the blob from its file and from the blob system: the file name
 * (if any) is forgotten, the file is unlinked, and the reference to the blob
 * system is cleared. The file descriptor of the blob is left open.
 */
static void
z_blob_detach(ZBlob *self)
{
  z_enter();
  g_assert(self);

  if (self->filename)
    {
      gchar *path = self->filename;

      /* clear the member before the file is removed */
      self->filename = NULL;
      unlink(path);
      g_free(path);
    }

  self->system = NULL;
  z_leave();
}

/**
 * z_blob_destroy:
 * @self: this
 *
 * Destroy a blob whose reference count has reached zero: unregisters it from
 * the blob system (a refusal is only logged), frees the in-memory data,
 * closes and deletes the file of the blob, then releases the
 * synchronisation objects and the instance itself.
 */
static void
z_blob_destroy(ZBlob *self)
{
  gboolean  unregistered;

  z_enter();
  g_assert(self);
  g_assert(self->ref_cnt == 0); /* anyway - coredump is better than silent failure */

  unregistered = z_blob_notify_system(self, Z_BLOB_REQ_UNREGISTER, NULL);
  if (!unregistered)
    z_log(NULL, CORE_DEBUG, 7, "blob system didn't unregister; blob='%p'", self);

  g_free(self->data);   /* a NULL data pointer is fine for g_free() */

  close(self->fd);
  if (self->filename != NULL)
    {
      z_log(NULL, CORE_DEBUG, 7, "deleting blob file; filename='%s'", self->filename);
      if (unlink(self->filename) != 0)
        z_log(NULL, CORE_DEBUG, 7, "unlink() failed; error='%s'", strerror(errno));
      g_free(self->filename);
      self->filename = NULL;
    }

  g_mutex_free(self->mtx_reply);
  g_cond_free(self->cond_reply);
  g_static_rw_lock_free(&(self->lock));
  g_mutex_free(self->mtx_lock);
  g_free(self);
  z_leave();
}


/**
 * z_blob_lock_rdwr:
 * @self: this
 * @timeout: negative: block until the lock is acquired; zero: try once
 *           without waiting; positive: keep trying for this many milliseconds
 * @op_wr: TRUE for write-locking
 *
 * Lock a blob for reading or writing.
 * Every acquisition of the rw-lock ('lock') is made while holding the outer
 * mutex ('mtx_lock'), which is released again before returning.
 *
 * NOTE(review): in blocking mode the outer mutex is held while waiting for
 * the rw-lock, and z_blob_downgrade_lock() needs the same mutex before it
 * releases its write lock. A thread blocked here on a lock whose holder is
 * about to downgrade looks like a deadlock - please confirm.
 *
 * Returns:
 * TRUE if the lock has been acquired, FALSE otherwise (can't happen in
 * blocking mode)
 */
static gboolean
z_blob_lock_rdwr(ZBlob *self, gint timeout, gboolean op_wr)
{
  gboolean        res, outer_lock_got;
  struct timeval  tvnow, tvfinish;

  z_enter();
  g_assert(self);

  if (timeout < 0)        /* infinite timeout -> blocking mode */
    {
      g_mutex_lock(self->mtx_lock);
      (op_wr ? g_static_rw_lock_writer_lock : g_static_rw_lock_reader_lock)(&(self->lock));
      g_mutex_unlock(self->mtx_lock);
      res = TRUE;
    }
  else if (timeout == 0)  /* zero timeout -> nonblocking mode */
    {
      /* fails if either the outer mutex or the rw-lock is busy */
      res = g_mutex_trylock(self->mtx_lock);
      if (res)
        {
          res = (op_wr ? g_static_rw_lock_writer_trylock : g_static_rw_lock_reader_trylock)(&(self->lock));
          g_mutex_unlock(self->mtx_lock);
        }
    }
  else                    /* positive timeout */
    {
      /* compute the deadline: now + timeout milliseconds, normalised so
       * that tv_usec stays below one second */
      gettimeofday(&tvfinish, NULL);
      tvfinish.tv_sec += (timeout / 1000);
      tvfinish.tv_usec += 1000 * (timeout % 1000);
      tvfinish.tv_sec += (tvfinish.tv_usec / 1000000);
      tvfinish.tv_usec %= 1000000;
      outer_lock_got = FALSE;
      /* FIXME: maybe g_cond_wait_timed_wait ? */
      /* poll in 1 ms steps: first for the outer mutex, then (keeping the
       * outer mutex between the attempts) for the rw-lock */
      do
        {
					res = FALSE;
          if (!outer_lock_got)
              outer_lock_got = g_mutex_trylock(self->mtx_lock);
          if (outer_lock_got)
            {
              res = (op_wr ? g_static_rw_lock_writer_trylock : g_static_rw_lock_reader_trylock)(&(self->lock));
              if (res)
                  break;
            }
          usleep(1000);
          gettimeofday(&tvnow, NULL);
        } 
      while ((tvnow.tv_sec < tvfinish.tv_sec) ||
             ((tvnow.tv_sec == tvfinish.tv_sec) && (tvnow.tv_usec < tvfinish.tv_usec)));

      /* the outer mutex is released whether or not the rw-lock was acquired */
      if (outer_lock_got)
          g_mutex_unlock(self->mtx_lock);
    }

  z_leave();
  return res;
}

/**
 * z_blob_unlock_write:
 * @self: this
 *
 * Unlock a blob locked for writing.
 * Releases the writer side of the blob's rw-lock; the outer mutex
 * ('mtx_lock') is not involved.
 */
void
z_blob_unlock_write(ZBlob *self)
{
  z_enter();
  g_assert(self);
  g_static_rw_lock_writer_unlock(&(self->lock));
  z_leave();
}

/**
 * z_blob_lock_write:
 * @self: this
 * @timeout: negative: block until acquired; zero: try once; positive:
 *           keep trying for this many milliseconds
 *
 * Lock a blob for writing.
 *
 * Returns:
 * TRUE if the write lock has been acquired
 */
gboolean
z_blob_lock_write(ZBlob *self, gint timeout)
{
  gboolean  locked;

  locked = z_blob_lock_rdwr(self, timeout, TRUE);
  return locked;
}

/**
 * z_blob_lock_read:
 * @self: this
 * @timeout: negative: block until acquired; zero: try once; positive:
 *           keep trying for this many milliseconds
 *
 * Lock a blob for reading.
 *
 * Returns:
 * TRUE if the read lock has been acquired
 */
gboolean
z_blob_lock_read(ZBlob *self, gint timeout)
{
  gboolean  locked;

  locked = z_blob_lock_rdwr(self, timeout, FALSE);
  return locked;
}


/**
 * z_blob_downgrade_lock:
 * @self: this
 *
 * Turns the write lock held by the caller into a read lock.
 * The writer side is released and the reader side is acquired while holding
 * the outer mutex ('mtx_lock'); z_blob_lock_rdwr() takes the same mutex for
 * every acquisition, so no lock request made through it can get in between
 * the two steps.
 *
 * NOTE(review): if another thread is blocked inside z_blob_lock_rdwr() in
 * blocking mode, it holds 'mtx_lock' while waiting for the lock the caller
 * owns, so the g_mutex_lock() below may never return - please confirm.
 */
static void
z_blob_downgrade_lock(ZBlob *self)
{
  z_enter();
  g_assert(self);
  g_mutex_lock(self->mtx_lock);
  g_static_rw_lock_writer_unlock(&(self->lock));
  g_static_rw_lock_reader_lock(&(self->lock));
  g_mutex_unlock(self->mtx_lock);
  z_leave();
}


/**
 * z_blob_unlock_read:
 * @self: this
 *
 * Unlock a blob locked for reading.
 * Releases the reader side of the blob's rw-lock; the outer mutex
 * ('mtx_lock') is not involved.
 */
void
z_blob_unlock_read(ZBlob *self)
{
  z_enter();
  g_assert(self);
  g_static_rw_lock_reader_unlock(&self->lock);
  z_leave();
}

/**
 * z_blob_set_access_type:
 * @self: this
 * @access: the access type to store
 *
 * Stores @access as the access type hint of the blob. The member is written
 * while holding the write lock of the blob, which is waited for without a
 * time limit.
 */
void
z_blob_set_access_type(ZBlob *self, ZBlobAccessType access)
{
  z_enter();
  g_assert(self);

  /* negative timeout: wait for the write lock as long as it takes */
  z_blob_lock_write(self, -1);
  self->access = access;
  z_blob_unlock_write(self);

  z_leave();
}

/**
 * z_blob_get_access_type:
 * @self: this
 *
 * Reads the access type hint of the blob while holding its read lock, which
 * is waited for without a time limit.
 *
 * Returns:
 * The current access type
 */
ZBlobAccessType
z_blob_get_access_type(ZBlob *self)
{
  ZBlobAccessType     current;

  z_enter();
  g_assert(self);

  /* negative timeout: wait for the read lock as long as it takes */
  z_blob_lock_read(self, -1);
  current = self->access;
  z_blob_unlock_read(self);

  z_leave();
  return current;
}

/**
 * z_blob_set_access_mode:
 * @self: this
 * @mode: the access mode to store
 *
 * Stores @mode as the access mode of the blob. The member is written while
 * holding the write lock of the blob, which is waited for without a time
 * limit.
 */
void
z_blob_set_access_mode(ZBlob *self, ZBlobAccessMode mode)
{
  z_enter();
  g_assert(self);

  /* negative timeout: wait for the write lock as long as it takes */
  z_blob_lock_write(self, -1);
  self->mode = mode;
  z_blob_unlock_write(self);

  z_leave();
}

/**
 * z_blob_get_access_mode:
 * @self: this
 *
 * Reads the access mode of the blob while holding its read lock, which is
 * waited for without a time limit.
 *
 * Returns:
 * The current access mode
 */
ZBlobAccessMode
z_blob_get_access_mode(ZBlob *self)
{
  ZBlobAccessMode   current;

  z_enter();
  g_assert(self);

  /* negative timeout: wait for the read lock as long as it takes */
  z_blob_lock_read(self, -1);
  current = self->mode;
  z_blob_unlock_read(self);

  z_leave();
  return current;
}

/**
 * z_blob_tighten:
 * @self: this
 *
 * Shrinks the allocated space exactly to the used amount and gives the
 * difference back to the matching usage counter (disk or memory) of the blob
 * system. Does nothing if the blob has no blob system or has no unused
 * space. If the file of a swapped-out blob can't be truncated, the
 * accounting is left untouched.
 * Called only from the management thread to gain some space in _really_ tight
 * situations.
 */
void
z_blob_tighten(ZBlob *self)
{
  gsize         amount_free;

  if (!self->system)
    return;

  /* compare before subtracting: the difference is unsigned, so testing it
   * against zero afterwards could never catch size > alloc_size */
  if (self->alloc_size <= self->size)
    return;
  amount_free = self->alloc_size - self->size;

  if (self->is_in_file)
    {
      if (ftruncate(self->fd, self->size) < 0)
        {
          z_log(NULL, CORE_ERROR, 2, "ftruncate() failed; error='%s'", strerror(errno));
          return;
        }
      self->system->disk_used -= amount_free;
    }
  else
    {
      self->data = g_renew(gchar, self->data, self->size);
      self->system->mem_used -= amount_free;
    }

  /* the allocation is exactly as large as the content from now on; without
   * this the freed space would be counted as still allocated by the blob */
  self->alloc_size = self->size;
}

/**
 * z_blob_alloc:
 * @self: this
 * @req_size: required space
 *
 * Allocates space for the blob (not necessarily in memory!)
 * !!! Caller shall hold a write lock on the blob !!!
 *
 * Starting from the current allocation, the allocation size is halved until
 * it no longer exceeds @req_size, then doubled until it covers @req_size.
 * If the resulting allocation differs from the current one, the change is
 * requested from the blob system (Z_BLOB_REQ_ALLOC); when the request is
 * denied, the blob is left untouched.
 *
 * Returns:
 * The size of the blob after the call: @req_size on success, the unchanged
 * previous size if the blob system denied the request.
 */
static gsize
z_blob_alloc(ZBlob *self, gsize req_size)
{
  gchar         *newdata;
  gint          err;
  gsize         req_alloc_size, alloc_req;

  z_enter();
  g_assert(self);

  /* gsize values are cast to gulong to match the conversion specifier */
  z_log(NULL, CORE_DEBUG, 7, "current allocation; allocated='%lu', used='%lu', requested='%lu'",
        (gulong) self->alloc_size, (gulong) self->size, (gulong) req_size);

  /* determine the allocation size */
  if (self->alloc_size == 0)
    {
      req_alloc_size = req_size;
    }
  else
    {
      /* First run (if shrinking reqd): go just below the requested size */
      req_alloc_size = self->alloc_size;
      while (req_alloc_size > req_size)
        {
          req_alloc_size >>= 1;
          z_log(NULL, CORE_DEBUG, 7, "prev alloc step; size='%lu'", (gulong) req_alloc_size);
        }

      /* Second run: find next available size */
      while (req_alloc_size < req_size)
        {
          if (req_alloc_size > (((gsize) -1) >> 1))
            {
              /* doubling would wrap around to a smaller value (possibly 0,
               * which would never terminate the loop): ask for the exact
               * size instead */
              req_alloc_size = req_size;
              break;
            }
          req_alloc_size <<= 1;
          z_log(NULL, CORE_DEBUG, 7, "next alloc step; size='%lu'", (gulong) req_alloc_size);
        }
      z_log(NULL, CORE_DEBUG, 7, "final alloc step; size='%lu'", (gulong) req_alloc_size);
    }

  /* just return if the allocation needn't change */
  if (req_alloc_size == self->alloc_size)
    {
      self->size = req_size;
      z_leave();
      return self->size;
    }

  z_log(NULL, CORE_DEBUG, 7, "asking permission to allocate; blob='%p', size='%lu'", self, (gulong) req_alloc_size);

  /* NOTE(review): the delta is computed against the used size, while
   * z_blob_tighten() and z_blob_swap_out() account with alloc_size, and the
   * subtraction wraps around when shrinking -- confirm against the
   * Z_BLOB_REQ_ALLOC handler whether this is intended. */
  alloc_req = req_alloc_size - self->size;
  if (z_blob_notify_system(self, Z_BLOB_REQ_ALLOC, &alloc_req))
    {
      z_log(NULL, CORE_DEBUG, 7, "permission granted; blob='%p'", self);
      if (self->is_in_file)
        {
          err = ftruncate(self->fd, req_alloc_size);
          if (err < 0)
            z_log(NULL, CORE_DEBUG, 7, "ftruncate() failed;");
        }
      else
        {
          newdata = g_renew(gchar, self->data, req_alloc_size);
          /* g_renew() returns NULL for a zero-sized request, that is not
           * a failure */
          if (!newdata && req_alloc_size > 0)
            z_log(NULL, CORE_DEBUG, 7, "g_renew() failed;");
          /* newly gained space is handed out zero-filled */
          if (newdata && self->alloc_size < req_alloc_size)
            memset(newdata + self->alloc_size, 0, req_alloc_size - self->alloc_size);
          self->data = newdata;
        }
      self->alloc_size = req_alloc_size;
      self->size = req_size;
      self->stat.alloc_count++;
      self->stat.last_accessed = time(NULL);
    }
  else
    {
      /* size and allocation stay as they were */
      z_log(NULL, CORE_DEBUG, 7, "permission denied; blob='%p'", self);
    }
  z_leave();
  return self->size;
}

/**
 * z_blob_truncate:
 * @self: this
 * @pos: position to truncate at
 * @timeout: timeout
 *
 * Truncates/expands a blob
 *
 * Returns:
 * TRUE on success, FALSE if the write lock could not be obtained within
 * @timeout or if the size of the blob could not be changed to @pos
 */
gboolean
z_blob_truncate(ZBlob *self, gsize pos, gint timeout)
{
  gboolean      res = FALSE;

  z_enter();
  g_assert(self);
  if (z_blob_lock_write(self, timeout))
    {
      z_blob_alloc(self, pos);
      /* z_blob_alloc() leaves self->size untouched when the blob system
       * denies the request, so the resulting size tells whether the
       * truncation really happened */
      res = (self->size == pos);
      z_blob_unlock_write(self);
    }

  z_leave();
  return res;
}


/**
 * z_blob_add_copy:
 * @self: this
 * @pos: position to write to
 * @data: data to write
 * @req_datalen: length of @data
 * @timeout: timeout
 *
 * Write some data into the given position of the blob, expanding it if necessary.
 *
 * Nothing is written if the write lock cannot be obtained within @timeout,
 * if @pos + @req_datalen does not fit into a gsize, or if the blob could not
 * be expanded to hold the data.
 *
 * Returns:
 * The amount of data written.
 */
gsize
z_blob_add_copy(ZBlob *self, gsize pos, const gchar* data, gsize req_datalen, gint timeout)
{
  gsize         end_pos;
  gsize         done = 0;
  gssize        written;

  z_enter();
  g_assert(self);
  g_assert(data);

  end_pos = pos + req_datalen;
  if (end_pos < pos)
    {
      /* unsigned wrap-around: the requested range is not addressable */
      z_log(NULL, CORE_ERROR, 2, "write range overflows;");
      z_leave();
      return 0;
    }

  if (z_blob_lock_write(self, timeout))
    {
      if (self->size < end_pos)
        z_blob_alloc(self, end_pos);

      if (self->size < end_pos)
        {
          /* z_blob_alloc() leaves the size untouched when the request is
           * denied; writing anyway would run past the end of the blob */
          z_log(NULL, CORE_ERROR, 2, "blob allocation failed;");
        }
      else if (self->is_in_file)
        {
          if (lseek(self->fd, (off_t) pos, SEEK_SET) == (off_t) -1)
            {
              /* don't write at whatever offset the descriptor is at */
              z_log(NULL, CORE_ERROR, 2, "lseek() failed;");
            }
          else
            {
              written = write(self->fd, data, req_datalen);
              if (written < 0)
                {
                  z_log(NULL, CORE_ERROR, 2, "write() failed;");
                }
              else
                {
                  done = (gsize) written;
                  if (done != req_datalen)
                    z_log(NULL, CORE_ERROR, 2, "write() partially failed;");
                }
            }
        }
      else
        {
          if (req_datalen > 0)
            memmove(self->data + pos, data, req_datalen);
          done = req_datalen;
        }
      self->stat.req_wr++;
      self->stat.total_wr += done;
      self->stat.last_accessed = time(NULL);
      z_blob_unlock_write(self);
    }

  z_leave();
  return done;
}

/**
 * z_blob_get_copy:
 * @self: this
 * @pos: position to read from
 * @data: buffer to read into
 * @req_datalen: bytes to read
 * @timeout: timeout
 *
 * Reads some data from the blob into a buffer.
 * The requested range is clipped to the current size of the blob; a @pos at
 * or beyond the end of the blob reads nothing.
 *
 * Returns:
 * The amount of data actually read (0 if the read lock could not be obtained
 * within @timeout).
 */
gsize
z_blob_get_copy(ZBlob *self, gsize pos, gchar* data, gsize req_datalen, gint timeout)
{
  gsize         done = 0;
  gssize        rd;

  z_enter();
  g_assert(self);
  g_assert(data);
  if (z_blob_lock_read(self, timeout))
    {
      /* Clip the range while holding the lock, so that the size checked
       * here is the one the copy below works with. The sizes are unsigned,
       * so pos must be compared before subtracting. */
      if (pos >= self->size)
        req_datalen = 0;
      else if (req_datalen > (self->size - pos))
        req_datalen = self->size - pos;

      if (req_datalen == 0)
        {
          /* nothing to copy */
        }
      else if (self->is_in_file)
        {
          if (lseek(self->fd, (off_t) pos, SEEK_SET) == (off_t) -1)
            {
              /* don't read from whatever offset the descriptor is at */
              z_log(NULL, CORE_ERROR, 2, "lseek() failed;");
            }
          else
            {
              rd = read(self->fd, data, req_datalen);
              if (rd < 0)
                {
                  z_log(NULL, CORE_ERROR, 2, "read() failed;");
                }
              else
                {
                  done = (gsize) rd;
                  if (done != req_datalen)
                    z_log(NULL, CORE_ERROR, 2, "read() partially failed;");
                }
            }
        }
      else
        {
          memmove(data, self->data + pos, req_datalen);
          done = req_datalen;
        }
      self->stat.req_rd++;
      self->stat.total_rd += done;
      self->stat.last_accessed = time(NULL);
      z_blob_unlock_read(self);
    }

  z_leave();
  return done;
}

/**
 * z_blob_commit:
 * @self: this (unused)
 *
 * Commit pending changes to the on-disk copy of the blob.
 * !Currently not used!
 *
 * The function is an unimplemented placeholder: calling it trips the
 * assertion below.
 */
void
z_blob_commit(ZBlob *self G_GNUC_UNUSED)
{
  z_enter();
  /* FIXME: implement */
  g_assert(0);
  z_leave();
}

/**
 * z_blob_get_file:
 * @self: this
 * @timeout: timeout
 *
 * Get the (absolute) filename assigned to the blob.
 * The blob is swapped out to its file first. On success the write lock
 * taken for the swap-out is downgraded and stays held until the caller
 * invokes z_blob_release_file().
 *
 * Returns:
 * The filename, or NULL if the blob has no filename or blob system, or if
 * the lock could not be obtained within @timeout
 */
const gchar*
z_blob_get_file(ZBlob *self, gint timeout)
{
  const gchar   *res = NULL;

  z_enter();
  g_assert(self);

  if (self->filename && self->system)
    {
      g_mutex_lock(self->system->mtx_blobsys); /* swap_out() accesses the blob systems data
                                                  directly, so it needs to be locked */
      if (z_blob_lock_write(self, timeout))
        {
          z_blob_swap_out(self);
          g_mutex_unlock(self->system->mtx_blobsys);
          z_blob_downgrade_lock(self);
          res = self->filename;
        }
      else
        {
          /* the blob lock timed out: the blob system mutex must not stay
           * locked, nobody would ever release it */
          g_mutex_unlock(self->system->mtx_blobsys);
        }
    }

  z_leave();
  return res;
}


/**
 * z_blob_release_file:
 * @self: this
 *
 * Releases the read lock of the blob. Presumably the counterpart of
 * z_blob_get_file(), which returns with the (downgraded) lock still held.
 */
void
z_blob_release_file(ZBlob *self)
{
  z_enter();
  g_assert(self);
  z_blob_unlock_read(self);
  z_leave();
}


/**
 * z_blob_swap_out:
 * @self: this
 *
 * Writes a blob out to disk and releases its in-memory copy.
 * !!! Caller must hold a lock BOTH on the blob AND the blob system !!!
 *
 * Does nothing if the blob is already in its file or has no blob system.
 * The in-memory copy is given up only after all the data has been written;
 * if positioning or writing fails, the blob stays in memory.
 */
static void
z_blob_swap_out(ZBlob *self)
{
  gsize         done;
  gssize        written;

  z_enter();
  g_assert(self);
  if (!self->is_in_file && self->system)
    {
      if (lseek(self->fd, 0, SEEK_SET) == (off_t) -1)
        {
          z_log(NULL, CORE_ERROR, 2, "lseek() failed;");
          z_leave();
          return;
        }

      /* write() may store less than requested, keep going until all the
       * data is out or a real error occurs */
      done = 0;
      while (done < self->size)
        {
          written = write(self->fd, self->data + done, self->size - done);
          if (written < 0)
            {
              if (errno == EINTR)
                continue;
              z_log(NULL, CORE_ERROR, 2, "write() failed;");
              break;
            }
          if (written == 0)
            break;
          done += (gsize) written;
        }

      if (done != self->size)
        {
          /* freeing the memory now would lose the data that didn't make
           * it to the disk */
          z_log(NULL, CORE_ERROR, 2, "write() partially failed;");
          z_leave();
          return;
        }

      self->is_in_file = 1;
      g_free(self->data);
      self->data = NULL;
      self->stat.swap_count++;
      self->stat.last_accessed = time(NULL);
      self->system->mem_used -= self->alloc_size;
      self->system->disk_used += self->alloc_size;
    }
  z_leave();
}

/**
 * z_blob_fetch_in:
 * @self: this
 *
 * Reads a blob from disk, called only from z_blob_system_threadproc()
 * !!! Caller must hold a lock BOTH on the blob AND the blob system !!!
 *
 * Does nothing if the blob is not in its file or has no blob system.
 * The in-memory buffer is created with the full allocation size of the blob
 * and is zero-filled beyond the data read. If positioning or reading fails,
 * the blob stays on disk and the file is not touched.
 */
static void
z_blob_fetch_in(ZBlob *self)
{
  gchar         *buf;
  gsize         buf_len;
  gint          err;
  gssize        rd;

  z_enter();
  g_assert(self);
  if (self->is_in_file && self->system)
    {
      if (lseek(self->fd, 0, SEEK_SET) == (off_t) -1)
        {
          z_log(NULL, CORE_DEBUG, 7, "lseek() failed;");
          z_leave();
          return;
        }

      /* The memory usage is accounted with alloc_size and z_blob_alloc()
       * lets the blob grow up to alloc_size without reallocating, so the
       * buffer must really be that large. */
      buf_len = MAX(self->alloc_size, self->size);
      buf = g_new0(gchar, buf_len);
      /* g_new0() returns NULL for a zero-sized request, that is not a failure */
      if (!buf && buf_len > 0)
        z_log(NULL, CORE_DEBUG, 7, "g_new0() failed;");

      rd = read(self->fd, buf, self->size);
      if (rd < 0)
        {
          /* keep the on-disk copy, truncating the file now would lose it */
          z_log(NULL, CORE_DEBUG, 7, "read() failed;");
          g_free(buf);
          z_leave();
          return;
        }
      /* a short read is tolerated: the rest of the buffer stays zeroed */
      if (rd != (gssize)self->size)
        z_log(NULL, CORE_DEBUG, 7, "read() partially failed;");

      self->data = buf;
      self->is_in_file = 0;
      err = ftruncate(self->fd, 0);
      if (err < 0)
        z_log(NULL, CORE_DEBUG, 7, "ftruncate() failed;");
      self->stat.last_accessed = time(NULL);
      self->system->disk_used -= self->alloc_size;
      self->system->mem_used += self->alloc_size;
    }
  z_leave();
}

/**
 * z_blob_get_ptr:
 * @self: this
 * @pos: start of the range to get ptr for
 * @req_datalen: length of the range: in=requested, out=mapped
 * @timeout: timeout
 *
 * Obtains a pointer to a subrange of the blob.
 * Until the pointer is freed by 'z_blob_free_ptr()', the blob will be locked for
 * reading, that means read operations are still possible, but writes and
 * swapping is disabled and will block!
 *
 * The range is clipped to the size of the blob. A range that is empty or
 * starts at or beyond the end of the blob yields NULL (with @req_datalen
 * set to 0), and the blob is not left locked.
 *
 * If the blob is in its file, the range is mapped with mmap() using @pos as
 * the file offset; mmap() requires the offset to be a multiple of the page
 * size, so an unaligned @pos makes the mapping (and this call) fail.
 *
 * Returns:
 * The pointer on success, NULL on error
 */
gchar*
z_blob_get_ptr(ZBlob *self, gsize pos, gsize *req_datalen, gint timeout)
{
  gchar             *data = NULL;

  z_enter();
  g_assert(self);
  g_assert(req_datalen);
  if ((self->size > 0) && z_blob_lock_read(self, timeout))
    {
      /* The sizes are unsigned, so pos must be checked before subtracting */
      if (pos >= self->size)
        *req_datalen = 0;
      else if (*req_datalen > (self->size - pos))
        *req_datalen = self->size - pos;

      if (*req_datalen == 0)
        {
          /* an empty range can't be mapped, and z_blob_free_ptr() insists
           * on a non-zero mapped length */
        }
      else if (self->is_in_file)
        {
          data = mmap(NULL, *req_datalen, PROT_READ | PROT_WRITE, MAP_SHARED, self->fd, pos);
          if (data == MAP_FAILED)
            data = NULL;
        }
      else
        {
          data = self->data + pos;
        }

      if (data)
        {
          self->mapped_ptr = data;
          self->mapped_length = *req_datalen;
        }
      else
        {
          /* no pointer is handed out, so nobody will call z_blob_free_ptr()
           * to release the lock */
          z_blob_unlock_read(self);
        }
    }

  z_leave();
  return data;
}

/**
 * z_blob_free_ptr:
 * @self: this
 * @data: Pointer to a range, obtained by 'z_blob_get_ptr()'
 *
 * Unlocks a blob locked by 'z_blob_get_ptr()'.
 * @data must be the pointer currently recorded in the blob (mapped_ptr);
 * the length of the range is taken from the blob (mapped_length), it is not
 * passed by the caller.
 */
void
z_blob_free_ptr(ZBlob *self, gchar *data)
{
  z_enter();
  g_assert(self);
  g_assert(self->mapped_ptr);
  g_assert(self->mapped_ptr == data);
  g_assert(self->mapped_length > 0);

  /* a blob that is in its file handed out an mmap()-ed range, undo the mapping */
  if (self->is_in_file)
    munmap(data, self->mapped_length);

  /* forget the recorded range */
  self->mapped_ptr = NULL;
  self->mapped_length = 0;

  /* release the read lock */
  z_blob_unlock_read(self);

  z_leave();
}

/*
 * TODO: optimisation of pre-allocation
 * Reason: simply doubling of pre-allocated space wastes the address range:
 * |- x -|
 *       |--- 2x ---|
 *                  |-------- 4x --------|
 * etc. There's no way to use the deallocated address space, because, for
 * example the next allocation of 8x can't fit in the x+2x = 3x hole.
 * 
 * To make the freed address range reusable, a growth factor less than 2 must be used:
 * |- x -|
 *       |- (p^1)*x -|
 *                   |---- (p^2)*x ----|
 * |---- (p^3)*x ----|
 * 
 * The exact value of p is the solution of the equation
 *             p^3 = p + 1
 *     p^3 - p - 1 = 0
 *
 * whose real solution is (cbrt(x) ::= the cube root of x)
 *
 *    p = cbrt( (1+sqrt(23/27))/2 ) + cbrt( (1-sqrt(23/27))/2 )
 *    p ~= 1.3247179572447460
 * 
 * This can be approximated quite well by
 *     p = 1 + 1/3 - 1/116 
 * and its reciprocal (1/p ~= 0.7548777) can be approximated by
 *   1/p = 1 - 1/4 + 1/205
 * 
 * Unfortunately, using integers the rounding error accumulates, and this has
 * two bad side-effects:
 *  - sometimes the calculated next allocation size is off the range by one
 *  - growing and shrinking back is also off by one in some cases
 *  
 * Because this optimisation affects only allocations comparable to the size of
 * the address space (>= 4 GB), currently I use simple doubling, but this must
 * be fixed some time.
 */
