/*
 *   (C) Copyright IBM Corp. 2001, 2003
 *
 *   This program is free software;  you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
 *   the GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program;  if not, write to the Free Software
 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *
 * Module: mdregmgr
 * File: raid0_discover.c
 *
 * Description: This file contains all functions related to the initial
 *              discovery of raid0 MD physical volumes and logical
 *              volumes.
 */

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <plugin.h>

#include "md.h"
#include "raid0_mgr.h"
#include "raid0_discover.h"

/* Alias the generic plugin-record name to the raid0 plugin record for this
 * file; presumably consumed by the shared logging macros (LOG_ENTRY,
 * LOG_DEBUG, ...) -- TODO confirm against plugin.h / md.h. */
#define my_plugin_record raid0_plugin

/*
 * create_strip_zones
 *
 * Fill in the raid0 configuration (mdvol_to_conf(volume)) for a volume:
 *
 *   - strip_zone[]: one entry per distinct chunk-aligned member size.  Zone i
 *     starts at the per-device offset where zone i-1 ended and contains every
 *     member whose aligned size reaches past that offset.
 *   - smallest_zone: the strip zone with the smallest total size.
 *   - hash_table[]: nr_zones slots, each smallest_zone->size sectors wide,
 *     naming the strip zone (zone0) that starts the slot and, if the slot
 *     straddles a boundary, the following strip zone (zone1).
 *
 * volume->region must already be set; its size is used to compute nr_zones.
 *
 * Returns:
 *   0       on success, or when the volume is flagged MD_CORRUPT (in which
 *           case nothing is built),
 *   ENOMEM  if one of the tables cannot be allocated,
 *   EINVAL  if the member sizes do not produce a usable layout.
 *
 * On an error return any table that was already allocated stays attached to
 * the conf structure (NOTE(review): presumably released together with the
 * volume's private data -- confirm in the cleanup path).
 */
static int create_strip_zones (md_volume_t * volume)
{
	int rc = 0;
	int i, c, j, j1, j2, cur;
	u_int64_t current_offset;
	u_int64_t curr_zone_offset;
	u_int64_t size;
	u_int64_t zone0_size;
	raid0_conf_t * conf = mdvol_to_conf(volume);
	storage_object_t * child_object;
	storage_object_t * child_object1;
	storage_object_t * child_object2;
	storage_object_t * smallest_child;
	unsigned int chunk_size_in_sectors;

	LOG_ENTRY();

	// if this volume is corrupt, can't build the stripes correctly, so just return
	if (volume->flags & MD_CORRUPT) {
		LOG_EXIT_INT(0);
		return 0;
	}

	chunk_size_in_sectors = volume->super_block->chunk_size >> EVMS_VSECTOR_SIZE_SHIFT;

	/*
	 * Count the number of 'same size groups': a member opens a new group
	 * only if no earlier member has the same chunk-aligned size.
	 */
	conf->nr_strip_zones = 0;

	for (j1 = 0; j1 < volume->nr_disks; j1++) {
		child_object1 = volume->child_object[j1];
		LOG_DEBUG("Looking at %s\n", child_object1->name);
		c = 0;

		for (j2 = 0; j2 < volume->nr_disks; j2++) {
			child_object2 = volume->child_object[j2];
			LOG_DEBUG("Comparing %s(%"PRIu64") with %s(%"PRIu64")\n", child_object1->name, MD_CHUNK_ALIGN_NEW_SIZE_SECTORS(chunk_size_in_sectors, child_object1->size), child_object2->name, MD_CHUNK_ALIGN_NEW_SIZE_SECTORS(chunk_size_in_sectors, child_object2->size));
			if (child_object2 == child_object1) {
				/* Only earlier members are compared. */
				LOG_DEBUG("  END\n");
				break;
			}
			if (MD_CHUNK_ALIGN_NEW_SIZE_SECTORS(chunk_size_in_sectors, child_object2->size) == MD_CHUNK_ALIGN_NEW_SIZE_SECTORS(chunk_size_in_sectors, child_object1->size)) {
				/*
				 * Not unique, dont count it as a new
				 * group
				 */
				LOG_DEBUG("  EQUAL\n");
				c = 1;
				break;
			}
			LOG_DEBUG("  NOT EQUAL\n");
		}
		if (!c) {
			LOG_DEBUG("  ==> UNIQUE\n");
			conf->nr_strip_zones++;
			LOG_DEBUG("  %d zones\n", conf->nr_strip_zones);
		}
	}
	LOG_DEBUG("FINAL %d zones\n", conf->nr_strip_zones);

	conf->strip_zone = EngFncs->engine_alloc(sizeof(struct strip_zone) * conf->nr_strip_zones);

	if (!conf->strip_zone) {
		/* Report the failure instead of returning the initial rc of 0. */
		rc = ENOMEM;
		LOG_CRITICAL("Error %d allocating memory for strip zone structures.\n", rc);
		LOG_EXIT_INT(rc);
		return rc;
	}

	conf->smallest_zone = NULL;
	current_offset = 0;
	curr_zone_offset = 0;

	for (i = 0; i < conf->nr_strip_zones; i++) {
		struct strip_zone *zone = conf->strip_zone + i;

		LOG_DEBUG("Zone %d\n", i);
		zone->dev_offset = current_offset;
		smallest_child = NULL;
		c = 0;

		/* Collect every member that still has space past current_offset. */
		for (j = 0; j < volume->nr_disks; j++) {

			child_object = volume->child_object[j];
			LOG_DEBUG("  checking %s ...\n", child_object->name);
			if (MD_CHUNK_ALIGN_NEW_SIZE_SECTORS(chunk_size_in_sectors, child_object->size) > current_offset) {
				LOG_DEBUG("  contained as device %d\n", c);
				zone->dev[c] = child_object;
				c++;
				if (!smallest_child || (MD_CHUNK_ALIGN_NEW_SIZE_SECTORS(chunk_size_in_sectors, child_object->size) < MD_CHUNK_ALIGN_NEW_SIZE_SECTORS(chunk_size_in_sectors, smallest_child->size))) {
					smallest_child = child_object;
					LOG_DEBUG("  (%"PRIu64") is smallest!.\n", MD_CHUNK_ALIGN_NEW_SIZE_SECTORS(chunk_size_in_sectors, child_object->size));
				}
			} else {
				LOG_DEBUG("  nope.\n");
			}
		}

		/*
		 * A zone without members happens when a member's chunk-aligned
		 * size is 0: it was counted as a size group above but can never
		 * reach past offset 0.  Fail instead of dereferencing NULL.
		 */
		if (!smallest_child) {
			rc = EINVAL;
			LOG_ERROR("Region %s: no member object reaches into strip zone %d.\n", volume->name, i);
			LOG_EXIT_INT(rc);
			return rc;
		}

		zone->nb_dev = c;
		zone->size = (MD_CHUNK_ALIGN_NEW_SIZE_SECTORS(chunk_size_in_sectors, smallest_child->size) - current_offset) * c;
		LOG_DEBUG("zone->nb_dev: %d, size: %"PRIu64"\n", zone->nb_dev, zone->size);

		if (!conf->smallest_zone || (zone->size < conf->smallest_zone->size))
			conf->smallest_zone = zone;

		zone->zone_offset = curr_zone_offset;
		curr_zone_offset += zone->size;

		/* The next zone starts where the smallest member of this one ends. */
		current_offset = MD_CHUNK_ALIGN_NEW_SIZE_SECTORS(chunk_size_in_sectors, smallest_child->size);
		LOG_DEBUG("current zone offset: %"PRIu64"\n", current_offset);
	}

	/*
	 * smallest_zone is still NULL when no strip zone was built (no members);
	 * its size is the divisor below, so it must also be non-zero.
	 */
	if (!conf->smallest_zone || !conf->smallest_zone->size) {
		rc = EINVAL;
		LOG_ERROR("Region %s: unable to determine the smallest strip zone.\n", volume->name);
		LOG_EXIT_INT(rc);
		return rc;
	}

	/* Number of hash slots: region size divided by the slot width, rounded up. */
	conf->nr_zones = (volume->region->size + conf->smallest_zone->size - 1) / conf->smallest_zone->size;
	LOG_DEBUG("Number of zones is %d.\n", conf->nr_zones);

	/* Set up the hash tables. */
	cur = 0;

	conf->hash_table = EngFncs->engine_alloc(sizeof (struct raid0_hash) * conf->nr_zones);
	if (!conf->hash_table) {
		/* Report the failure instead of returning the initial rc of 0. */
		rc = ENOMEM;
		LOG_CRITICAL("Error %d allocating memory for zone hash table.\n", rc);
		LOG_EXIT_INT(rc);
		return rc;
	}
	/* 'size' tracks how much of strip zone 'cur' is not yet covered by a slot. */
	size = conf->strip_zone[cur].size;

	i = 0;
	while (cur < conf->nr_strip_zones) {
		conf->hash_table[i].zone0 = conf->strip_zone + cur;

		/*
		 * If we completely fill the slot
		 */
		if (size >= conf->smallest_zone->size) {
			conf->hash_table[i++].zone1 = NULL;
			size -= conf->smallest_zone->size;

			/*
			 * If there is nothing left in the strip zone,
			 * move to the next strip zone.  Else, the
			 * next iteration of the loop will hit the
			 * code below where zone1 is filled in for this
			 * hash entry.
			 */
			if (!size) {
				if (++cur == conf->nr_strip_zones)
					continue;
				size = conf->strip_zone[cur].size;
			}
			continue;
		}
		if (++cur == conf->nr_strip_zones) {
			/*
			 * Last dev, set unit1 as NULL
			 */
			conf->hash_table[i].zone1=NULL;
			continue;
		}

		/*
		 * Here we use a 2nd dev to fill the slot
		 */
		zone0_size = size;
		size = conf->strip_zone[cur].size;
		conf->hash_table[i++].zone1 = conf->strip_zone + cur;
		/* The rest of the slot is taken from the start of the new zone. */
		size -= conf->smallest_zone->size - zone0_size;
	}

	LOG_EXIT_INT(0);
	return 0;
}


/*
 * raid0_create_region
 *
 * Create the EVMS region object for a raid0 MD volume and put it on
 * output_list.
 *
 * volume      - the MD volume to build a region for.
 * output_list - list that receives the new region.
 * final_call  - when FALSE, a volume whose member count does not match its
 *               first superblock is left alone (discovery is delayed);
 *               when TRUE the region is created even with members missing.
 *
 * The region is allocated under volume->name; if that name cannot be
 * allocated, names "md/md<N>" are tried from MAX_MD_DEVICES - 1 downwards
 * and the minor number N of the name that worked is stored in every member
 * superblock.  Missing members mark the volume and region corrupt.
 *
 * Returns 0 when discovery is delayed, ENOMEM when no region name could be
 * allocated, otherwise 0 (or the accumulated rc on the corrupt early exit).
 */
int raid0_create_region(md_volume_t * volume, list_anchor_t output_list, boolean final_call){
	int rc = 0;
	storage_object_t * region;
	int found = 0;
	int i, j = -1;
	int verify;
	unsigned int chunk_size_in_sectors;

	LOG_ENTRY();

	if ((!volume->super_array[0] || (volume->nr_disks !=  volume->super_array[0]->nr_disks)) &&
	    !final_call) {
		LOG_DETAILS("Region %s is missing members, delaying discovery\n",volume->name);
		LOG_EXIT_INT(0);
		return 0;
	}

	LOG_DETAILS("Discovered region %s.\n",volume->name);
	rc = EngFncs->allocate_region(volume->name, &region);
	if (rc) {
		/*
		 * The preferred name is taken; look for a free "md/md<N>" name.
		 * Leave the loop on the first success so that j still holds the
		 * minor number that matches the name actually allocated.
		 */
		for (j = MAX_MD_DEVICES - 1; j >= 0; j--) {
			sprintf(volume->name, "md/md%d",j);
			rc = EngFncs->allocate_region(volume->name, &region);
			if (!rc)
				break;
		}
		if (rc) {
			LOG_ERROR("No more names for MD ");
			LOG_EXIT_INT(ENOMEM);
			return ENOMEM;
		}
	}
	volume->region = region;
	region->size = 0;  // initialize for size calculation
	for (i = 0; (i < MAX_MD_DEVICES) && (found < volume->nr_disks); i++) {
		// check for null object, if missing, skip and set corrupt flag
		if (volume->child_object[i]) {
			chunk_size_in_sectors = volume->super_array[i]->chunk_size >> EVMS_VSECTOR_SIZE_SHIFT;
			// if name registration failed and we changed the name, fix up all the minor numbers
			if (j >= 0) {
				volume->super_array[i]->md_minor = j;
			}
			md_append_region_to_object(region, volume->child_object[i]);
			LOG_DETAILS("Adding Object %s to %s\n",volume->child_object[i]->name,volume->name);
			// the region size is the sum of the chunk-aligned member sizes
			region->size += MD_CHUNK_ALIGN_NEW_SIZE_SECTORS(chunk_size_in_sectors, volume->child_object[i]->size);
			found++;
		} else {
			MESSAGE("Region %s is corrupt, missing member object %d.\n",
				region->name, i);
			volume->flags |= MD_CORRUPT;
		}
	}

	region->data_type = DATA_TYPE;
	region->plugin = raid0_plugin;
	region->private_data = (void *)volume;
	volume->flags |= MD_DISCOVERED;
	volume->region = region;

	/* If the MD array is corrupt, mark the region object corrupt and exit */
	if (volume->flags & MD_CORRUPT) {
		region->flags |= SOFLAG_CORRUPT;
		md_add_object_to_list(region, output_list);
		LOG_EXIT_INT(rc);
		return rc;
	}

	volume->private_data = EngFncs->engine_alloc(sizeof (raid0_conf_t));
	if (volume->private_data) {
		rc = create_strip_zones(volume);
		if (rc) {
			/* Without a valid zone layout the region cannot be mapped. */
			LOG_CRITICAL("Error %d building the strip zones for region %s.\n", rc, region->name);
			volume->flags |= MD_CORRUPT;
			region->flags |= SOFLAG_CORRUPT;
		}
	} else {
		rc = ENOMEM;
		LOG_CRITICAL("Error %d allocating memory for raid 0 configuration structure.\n", rc);
		volume->flags |= MD_CORRUPT;
		region->flags |= SOFLAG_CORRUPT;
	}


	/*
	 * Query device-mapper for the status of this MD object.
	 * if this MD object is active, it's already activated as
	 * an DM device.  Otherwise, check with the MD kernel driver.
	 */
	rc = EngFncs->dm_update_status(region);
	if (!rc && (region->flags & SOFLAG_ACTIVE)) {
		LOG_DEBUG("Region %s is an active DM device (%d:%d)\n",
			region->name, region->dev_major, region->dev_minor);
	} else {
		rc = 0;
		region->dev_major = MD_MAJOR;
		region->dev_minor = volume->super_block->md_minor;
		MD_CHECK_ACTIVE(region);
	}
	
	if ((region->flags & SOFLAG_ACTIVE) == 0)
		region->flags |= SOFLAG_NEEDS_ACTIVATE;

	/*
	 * The verify result is a bitmask: bit 0 (value 1) flags a minor
	 * problem, bit 1 (value 2) a major one, so 3 means both.  Only a
	 * purely minor result is fixed on the fly; anything with the major
	 * bit set marks the array corrupt.
	 */
	verify = raid0_verify_and_fix_array(volume, 0, 1);
	if (verify == 1) {
		// minor change, just fix it.
		raid0_verify_and_fix_array(volume, 1, 0);
	} else if ((verify > 0) && (verify & 2)) {
		// major problem, mark both md volume and region corrupt!
		volume->flags |= MD_CORRUPT;
		region->flags |= SOFLAG_CORRUPT;
	}
	md_add_object_to_list(region, output_list);
	LOG_EXIT_INT(rc);
	return rc;
}




/* Function: raid0_discover_regions
 *
 *	Walk the global list of MD volumes and piece together a region for
 *	every RAID0 volume that has not been discovered yet.
 *
 *	output_list - receives the regions that get created.
 *	count       - incremented once for each volume that is flagged
 *	              MD_DISCOVERED after raid0_create_region() has run.
 *	final_call  - passed through to raid0_create_region().
 *
 *	Returns the result of the last raid0_create_region() call, or 0 if
 *	no volume needed one.
 */
int raid0_discover_regions( list_anchor_t output_list, int *count, boolean final_call )
{
	md_volume_t * vol = volume_list_head;
	int result = 0;

	my_plugin = raid0_plugin;
	LOG_ENTRY();

	for (; vol != NULL; vol = vol->next) {
		/* Already handled on an earlier pass. */
		if (vol->flags & MD_DISCOVERED)
			continue;

		/* Belongs to another MD personality. */
		if (vol->personality != RAID0)
			continue;

		result = raid0_create_region(vol, output_list, final_call);
		if (vol->flags & MD_DISCOVERED)
			(*count)++;
	}

	LOG_EXIT_INT(result);
	return result;
}

// Verify the raid 0 array described by 'volume' against its master
// superblock (volume->super_block).
//
// fix    - 0: only validate, nothing in the superblock is modified.
//          1: correct the superblock in memory while checking; afterwards
//             MD_CORRUPT and MD_DEGRADED are cleared on the volume and the
//             region is flagged SOFLAG_DIRTY.
// do_msg - non-zero: when not fixing, report the problems found with MESSAGE().
//
// The return value is a bitmask of the problems found (or fixed):
//   0 - array is OK / untouched.
//   1 - minor problem (only used for a member's device major/minor numbers).
//   2 - major problem (index, state, stale descriptor or disk count mismatch).
//   3 - both of the above.
// -1 is returned if 'volume' is NULL.

int  raid0_verify_and_fix_array(md_volume_t * volume, int fix, int do_msg)
{
	int     i;
	int 	change = 0;
	// raid_disks is counted below but never compared against the superblock.
	int 	nr_disks = 0, raid_disks = 0, spare_disks = 0, working_disks=0, active_disks=0;
	int 	failed_disks=0;
	int 	major, minor;
	mdp_disk_t disk;
	mdp_disk_t disk2;
	mdp_super_t *sb;

	LOG_ENTRY();
	
	if (!volume) {
		LOG_EXIT_INT(-1);
		return -1;
	}
	
	sb = volume->super_block;

	// Pass 1: check the descriptor of every member object, stopping at the
	// first missing one.
	for (i = 0; i < MAX_MD_DEVICES && nr_disks < volume->nr_disks; i++ ) {
		if (!volume->child_object[i]) {
			// ok, found a hole
			LOG_WARNING("Region %s : Found a hole at index=%d\n", volume->name, i);
			if (!descriptor_removed(&sb->disks[i]) && 
				!descriptor_empty(&sb->disks[i])) {
				LOG_WARNING("According to the master superblock, the device(%d:%d) is missing\n",
					sb->disks[i].major, sb->disks[i].minor);
			}
			break;  // array should already be marked corrupt, quit.
		} else {
			nr_disks++;
			// the descriptor's number and raid_disk must both equal its slot index
			if (sb->disks[i].number != i ||
			    sb->disks[i].raid_disk != i) {
				change |= 2;
				if (!fix) {	  // not allowed to fix, only report it (checking continues).
					if (do_msg) {
						MESSAGE("Region %s object index incorrect: is %d, should be %d.\n",
							volume->name, sb->disks[i].number,i);
					}
				} else {
					sb->disks[i].number = i;
					sb->disks[i].raid_disk = i;
				}
			}

			// Compare the recorded device numbers with the child object's,
			// unless the volume asks to keep the old ones.  A child reporting
			// 0:0 is not compared.
			if (!(volume->flags & MD_USE_OLD_DEV)) {
				major = volume->child_object[i]->dev_major;
				minor = volume->child_object[i]->dev_minor;
				if ((sb->disks[i].major != major ||
				     sb->disks[i].minor != minor) &&
					(major != 0 || minor !=0)) {
					change |= 1;
					LOG_DEFAULT("Region %s object index %d (%s) has incorrect major/minor (%d:%d), should be (%d:%d).\n",
						volume->name, i, volume->child_object[i]->name,
						sb->disks[i].major,
						sb->disks[i].minor,
						major, minor);
					if (fix) {
						// a superblock save is requested only when the major number changes
						if (sb->disks[i].major != major)
							volume->commit_flag |= MD_COMMIT_SAVE_SB;
						sb->disks[i].major = major;
						sb->disks[i].minor = minor;
					}
				}
			}

			// a member beyond the superblock's nr_disks; when fixing it is
			// re-marked as new, which the switch below then counts as a spare
			if (i >= sb->nr_disks) {
				change |= 2;
				if (fix) {
					sb->disks[i].state = (1 << MD_DISK_NEW);
				} else {
					if (do_msg) {
						MESSAGE("Region %s object index %d is greater than nr_disks.\n",
							volume->name, i);
					}
				}
			}

			// classify the member by its state bits and update the counters
			switch (sb->disks[i].state) {
			case (1<<MD_DISK_ACTIVE | 1<<MD_DISK_SYNC):
			case (1<<MD_DISK_ACTIVE | 1<<MD_DISK_SYNC | 1<<MD_DISK_PENDING_ACTIVE ):
			case (1<<MD_DISK_ACTIVE | 1<<MD_DISK_SYNC | 1<<MD_DISK_PENDING_ACTIVE | 1<<MD_DISK_NEW):
			case (1<<MD_DISK_ACTIVE | 1<<MD_DISK_SYNC | 1<<MD_DISK_NEW):
       				active_disks++;
       				raid_disks++;
       				working_disks++;
       				break;

				// active, but not sync, kernel just kind of ignores these drives
				// so make him a spare so that the kernel will re-sync if needed.
				// or sync, but not active, do the same.
			case (1<<MD_DISK_ACTIVE):
			case (1<<MD_DISK_SYNC):
				change |= 2;
				if (!fix) {
					if (do_msg) {
						MESSAGE("Region %s object index %d is in invalid state.\n",volume->name, i);
					}
				}else {
					volume->super_block->disks[i].state =(1<<MD_DISK_PENDING_ACTIVE)| (1<<MD_DISK_NEW);
				}
				/* fallthrough: the disk is counted as a spare either way */
			case 0:	// 0 = spare
			case (1<<MD_DISK_NEW):	// new = spare
			case (1<<MD_DISK_PENDING_ACTIVE):	// new = spare
			case (1<<MD_DISK_PENDING_ACTIVE | 1<<MD_DISK_NEW):	// new = spare
				spare_disks++;
       				working_disks++;
				break;
			case (1<<MD_DISK_REMOVED):
			case (1<<MD_DISK_FAULTY):
			case (1<<MD_DISK_FAULTY | 1<<MD_DISK_REMOVED):
			default:
				// any other state is counted as failed; this does not set a
				// 'change' bit by itself
				if (!fix) {
					if (do_msg) {
						MESSAGE("Region %s object index %d (%s) is faulty.  Array may be degraded.\n",
							volume->name, i, volume->child_object[i]->name);
					}
				}
				failed_disks++;
				break;
			}
		}
	}

	// check to be sure that all of the unused disks array entries are zeroed
	// If not, the boneheaded kernel MD code will use these even though
	// the count field indicates that they are not valid.
	// To make matters worse, only raid4/5 and 1 work this way, so since we have
	// a common SB creation routine we can not always be right.  So just allow
	// these extras disks entries to have the sync bit on or off.
	// 'disk' is the all-zero template with only the sync bit set, 'disk2' the
	// all-zero template; an unused entry matching either one is accepted.
	memset(&disk, 0, sizeof(mdp_disk_t));
	memset(&disk2, 0, sizeof(mdp_disk_t));
	disk.state = (1<<MD_DISK_SYNC);
	for (i = volume->nr_disks; i < MAX_MD_DEVICES; i++) {
		mdp_disk_t *descriptor = &sb->disks[i];

		if (descriptor_removed(descriptor))
			continue;

		if (memcmp(&disk, descriptor, sizeof(mdp_disk_t)) &&
		    memcmp(&disk2, descriptor, sizeof(mdp_disk_t))) {
			change |= 2;
			if (!fix) {
				if (do_msg) {
					MESSAGE("Region %s disks array not zeroed.\n",volume->name);
				}
			} else{
				// reset the entry to the sync-bit template
				memcpy(descriptor, &disk, sizeof(mdp_disk_t));
			}
		}
	}


	// the counters gathered above must match the superblock's bookkeeping
	if (sb->active_disks != active_disks ||
	    sb->working_disks != working_disks ||
	    sb->failed_disks != failed_disks ||
	    sb->spare_disks != spare_disks ||
	    sb->nr_disks != nr_disks ) {
		change |= 2;
		if (!fix) {
			if (do_msg) {
				MESSAGE("Region %s disk counts incorrect.\n",volume->name);
			}
		} else {
			sb->active_disks = active_disks;
			sb->working_disks = working_disks;
			sb->failed_disks = failed_disks;
			sb->spare_disks = spare_disks;
			sb->nr_disks = nr_disks;
		}
	}

	// a fix pass always clears the corrupt/degraded flags and dirties the
	// region, whether or not anything was actually changed
	if (fix) {
		volume->flags &= ~MD_CORRUPT;
		volume->flags &= ~MD_DEGRADED;
		volume->region->flags |= SOFLAG_DIRTY;
	}
	LOG_EXIT_INT(change);
	return change;
}

