/*  Screem:  screem-markup.c
 *
 *  Copyright (C) 2002 David A Knight
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 *  For contact information with the author of this source code please see
 *  the AUTHORS file.  If there is no AUTHORS file present then check the
 *  about box under the help menu for a contact address
 */

#include <gconf/gconf.h>

#include <glib/gstring.h>
#include <glib/gstrfuncs.h>
#include <glib/gmem.h>
#include <glib/gslist.h>
#include <glib/gmessages.h>

#include <gdk/gdkkeysyms.h>

#include <ctype.h>
#include <string.h>

#include "screem-application.h"
#include "screem-dtd.h"

#include "screem-site.h"
#include "screem-page.h"
#include "screem-search.h"

#include "fileops.h"

#include "support.h"

/* Pointer to a function that takes a character code as an int and
   returns a gunichar */
typedef gunichar(*Alter)(int c);


/* Forward declarations of the file-local helpers that are defined
   further down in this file */
static void screem_markup_fix_html_links( ScreemSite *site,
					  ScreemPage *page,
					  const gchar *pagepath,
					  const gchar *origpagepath,
					  const gchar *source,
					  const gchar *dest );
static void screem_markup_fix_css_links( ScreemSite *site,
					 ScreemPage *page,
					 const gchar *pagepath,
					 const gchar *origpagepath,
					 const gchar *source,
					 const gchar *dest );

static const gchar *screem_markup_get_region( const gchar *text, 
					      GSList *blocks );

/* Map a character value to the name of the HTML character entity
   that represents it, without the surrounding '&' and ';'.

   c is compared against the GDK_* keysym constants.

   Returns a pointer to a string literal (must not be freed), or NULL
   when there is no entity for c.

   The names follow the HTML 4 entity sets; several of them used to be
   misspelt (brkbar, laqo, suo2, suo3, fraq14, fraq12, fraq34) which
   produced entities no browser understands. */
const gchar* screem_markup_char_to_ent( guint c )
{
	const gchar *tag;

	switch( c ) {
	case GDK_ampersand:
		tag = "amp";
		break;
	case GDK_Agrave:
		tag = "Agrave";
		break;
	case GDK_Aacute:
		tag = "Aacute";
		break;
	case GDK_Acircumflex:
		tag = "Acirc";
		break;
	case GDK_Atilde:
		tag = "Atilde";
		break;
	case GDK_Adiaeresis:
		tag = "Auml";
		break;
	case GDK_Aring:
		tag = "Aring";
		break;
	case GDK_AE:
		tag = "AElig";
		break;
	case GDK_Ccedilla:
		tag = "Ccedil";
		break;
	case GDK_Egrave:
		tag = "Egrave";
		break;
	case GDK_Eacute:
		tag = "Eacute";
		break;
	case GDK_Ecircumflex:
		tag = "Ecirc";
		break;
	case GDK_Ediaeresis:
		tag = "Euml";
		break;
	case GDK_Igrave:
		tag = "Igrave";
		break;
	case GDK_Iacute:
		tag = "Iacute";
		break;
	case GDK_Icircumflex:
		tag = "Icirc";
		break;
	case GDK_Idiaeresis:
		tag = "Iuml";
		break;
	case GDK_ETH:
		tag = "ETH";
		break;
	case GDK_Ntilde:
		tag = "Ntilde";
		break;
	case GDK_Ograve:
		tag = "Ograve";
		break;
	case GDK_Oacute:
		tag = "Oacute";
		break;
	case GDK_Ocircumflex:
		tag = "Ocirc";
		break;
	case GDK_Otilde:
		tag = "Otilde";
		break;
	case GDK_Odiaeresis:
		tag = "Ouml";
		break;
	case GDK_Ooblique:
		tag = "Oslash";
		break;
	case GDK_Ugrave:
		tag = "Ugrave";
		break;
	case GDK_Uacute:
		tag = "Uacute";
		break;
	case GDK_Ucircumflex:
		tag = "Ucirc";
		break;
	case GDK_Udiaeresis:
		tag = "Uuml";
		break;
	case GDK_Yacute:
		tag = "Yacute";
		break;
	case GDK_THORN:
		tag = "THORN";
		break;
	case GDK_ssharp:
		tag = "szlig";
		break;
	case GDK_agrave:
		tag = "agrave";
		break;
	case GDK_aacute:
		tag = "aacute";
		break;
	case GDK_acircumflex:
		tag = "acirc";
		break;
	case GDK_atilde:
		tag = "atilde";
		break;
	case GDK_adiaeresis:
		tag = "auml";
		break;
	case GDK_aring:
		tag = "aring";
		break;
	case GDK_ae:
		tag = "aelig";
		break;
	case GDK_ccedilla:
		tag = "ccedil";
		break;
	case GDK_egrave:
		tag = "egrave";
		break;
	case GDK_eacute:
		tag = "eacute";
		break;
	case GDK_ecircumflex:
		tag = "ecirc";
		break;
	case GDK_ediaeresis:
		tag = "euml";
		break;
	case GDK_igrave:
		tag = "igrave";
		break;
	case GDK_iacute:
		tag = "iacute";
		break;
	case GDK_icircumflex:
		tag = "icirc";
		break;
	case GDK_idiaeresis:
		tag = "iuml";
		break;
	case GDK_eth:
		tag = "eth";
		break;
	case GDK_ntilde:
		tag = "ntilde";
		break;
	case GDK_ograve:
		tag = "ograve";
		break;
	case GDK_oacute:
		tag = "oacute";
		break;
	case GDK_ocircumflex:
		tag = "ocirc";
		break;
	case GDK_otilde:
		tag = "otilde";
		break;
	case GDK_odiaeresis:
		tag = "ouml";
		break;
	case GDK_oslash:
		tag = "oslash";
		break;
	case GDK_ugrave:
		tag = "ugrave";
		break;
	case GDK_uacute:
		tag = "uacute";
		break;
	case GDK_ucircumflex:
		tag = "ucirc";
		break;
	case GDK_udiaeresis:
		tag = "uuml";
		break;
	case GDK_yacute:
		tag = "yacute";
		break;
	case GDK_thorn:
		tag = "thorn";
		break;
	case GDK_ydiaeresis:
		tag = "yuml";
		break;
	case GDK_exclamdown:
		tag = "iexcl";
		break;
	case GDK_cent:
		tag = "cent";
		break;
	case GDK_sterling:
		tag = "pound";
		break;
	case GDK_currency:
		tag = "curren";
		break;
	case GDK_yen:
		tag = "yen";
		break;
	case GDK_brokenbar:
		/* was "brkbar", which is not an HTML 4 entity */
		tag = "brvbar";
		break;
	case GDK_section:
		tag = "sect";
		break;
	case GDK_diaeresis:
		tag = "uml";
		break;
	case GDK_copyright:
		tag = "copy";
		break;
	case GDK_ordfeminine:
		tag = "ordf";
		break;
	case GDK_guillemotleft:
		/* was "laqo" */
		tag = "laquo";
		break;
	case GDK_notsign:
		tag = "not";
		break;
	case GDK_hyphen:
		tag = "shy";
		break;
	case GDK_registered:
		tag = "reg";
		break;
	case GDK_macron:
		tag = "macr";
		break;
	case GDK_degree:
		tag = "deg";
		break;
	case GDK_plusminus:
		tag = "plusmn";
		break;
	case GDK_twosuperior:
		/* was "suo2" */
		tag = "sup2";
		break;
	case GDK_threesuperior:
		/* was "suo3" */
		tag = "sup3";
		break;
	case GDK_acute:
		tag = "acute";
		break;
	case GDK_mu:
		tag = "micro";
		break;
	case GDK_paragraph:
		tag = "para";
		break;
	case GDK_periodcentered:
		tag = "middot";
		break;
	case GDK_cedilla:
		tag = "cedil";
		break;
	case GDK_onesuperior:
		tag = "sup1";
		break;
	case GDK_masculine:
		tag = "ordm";
		break;
	case GDK_guillemotright:
		tag = "raquo";
		break;
	case GDK_onequarter:
		/* was "fraq14" */
		tag = "frac14";
		break;
	case GDK_onehalf:
		/* was "fraq12" */
		tag = "frac12";
		break;
	case GDK_threequarters:
		/* was "fraq34" */
		tag = "frac34";
		break;
	case GDK_questiondown:
		tag = "iquest";
		break;
	case GDK_quotedbl:
		tag = "quot";
		break;
	case GDK_Tab:
		tag = "nbsp";
		break;
	case GDK_less:
		tag = "lt";
		break;
	case GDK_greater:
		tag = "gt";
		break;
	default:
		tag = NULL;
		break;
	}

	return tag;
}

/* Check whether the character offset pos in text sits within an
   entity reference, i.e. between a '&' and the following ';' with
   no whitespace in between.

   start, if not NULL, receives the offset at which the backward scan
   stopped (it is written even when FALSE is returned).
   end, if not NULL, receives the offset at which the forward scan
   stopped; it is only written once a leading '&' has been found.

   Returns TRUE only when both the '&' and the ';' were found. */
gboolean screem_markup_is_entity( const gchar *text, gint pos,
				  gint *start, gint *end )
{
	const gchar *origin;
	const gchar *anchor;
	const gchar *cursor;
	gunichar ch;
	gint offset;

	origin = text;
	anchor = g_utf8_offset_to_pointer( text, pos );

	/* walk backwards looking for the '&'; when sitting on the
	   terminating ';' step over it first */
	cursor = anchor;
	offset = pos;
	if( pos > 0 && *cursor == ';' ) {
		cursor = g_utf8_prev_char( cursor );
		offset --;
	}

	ch = g_utf8_get_char( cursor );
	while( cursor > origin && ! g_unichar_isspace( ch ) &&
	       ch != '&' && ch != ';' ) {
		cursor = g_utf8_prev_char( cursor );
		ch = g_utf8_get_char( cursor );
		offset --;
	}

	if( start ) {
		*start = offset;
	}
	if( ch != '&' ) {
		return FALSE;
	}

	/* walk forwards looking for the ';'; when sitting on the
	   opening '&' step over it first */
	cursor = anchor;
	offset = pos;
	if( *cursor == '&' ) {
		cursor = g_utf8_next_char( cursor );
		offset ++;
	}

	ch = g_utf8_get_char( cursor );
	while( ch != '\0' && ! g_unichar_isspace( ch ) &&
	       ch != '&' && ch != ';' ) {
		cursor = g_utf8_next_char( cursor );
		ch = g_utf8_get_char( cursor );
		offset ++;
	}

	if( end ) {
		*end = offset;
	}

	return ( ch == ';' );
}

/* Check whether the character offset pos in text lies inside an
   attribute of a tag.

   The check is made in three passes:
    1. scan forward from pos to find the '>' closing the tag,
    2. walk back from that '>' to pos, tracking quote / '=' state,
    3. scan back from pos to the opening '<' and walk forward to pos
       again with the same state tracking.

   A negative pos is treated as 0.

   end, if not NULL, receives the offset recorded by pass 2 and
   start, if not NULL, the offset recorded by pass 3; each is only
   written when its pass succeeded.

   NOTE(review): the variable c doubles as "current character" and as
   the quote/state marker in passes 2 and 3, so statement order here
   matters. */
gboolean screem_markup_is_attribute( const gchar *text, gint pos,
				     gint *start, gint *end )
{
	gboolean ret = FALSE;
	gunichar c;
	const gchar *stext;
	const gchar *tmp;
	gint i;

	if( pos < 0 ) {
		pos = 0;
	}
	
	/* find end of tag */
	stext = text;
	text = g_utf8_offset_to_pointer( text, pos );
	tmp = text;
	i = pos;
	for( c = g_utf8_get_char( tmp );
		c != '\0' && c != '>' && c != '<';
		c = g_utf8_get_char( tmp ), ++ i ) {
		tmp = g_utf8_next_char( tmp );
	}
	/* hitting '<' or the end of the text first means pos is
	   not inside a closed tag */
	ret = ( c == '>' );
	
	/* move back along the tag counting opening attributes */
	if( ret ) {
		/* e is the candidate end offset, -1 while there is none */
		gint e = -1;
		gunichar tc;
		
		do {
			tmp = g_utf8_prev_char( tmp );
			i --;
			tc = g_utf8_get_char( tmp );
	
			if( tc != ' ' ) {
				/* we got something */

				if( tc == '"' && c != '"' ) {
					c = '"';
					e = i;
				} else if( tc == '\'' && c != '\'' ) {
					c = '\'';
					e = i;
				} else if( tc == '=' && c != '\0' ) {
					c = '\0';
					e = -1;
				} else if( c == '\0' ) {
					e = i;
					c = tc;
				}
			} else if( c != '"' && c != '\'' && c != '\0' ) {
				/* a space outside of quotes resets
				   the state */
				c = '\0';
				e = -1;
			}
		
		} while( ( i > pos ) && ( tmp > stext ) && 
				( tc != '<' ) );
		
		/* only succeed if we got all the way back to pos with
		   a candidate end offset recorded */
		ret = ( i == pos && e != -1 );
		if( ret && end ) {
			*end = e;
		}
	}

	if( ret ) {
		/* scan from the start of the tag
		   to confirm if we really are in an attribute,
		   or if the markup is just broken */
		tmp = text;
		for( c = g_utf8_get_char( tmp );
			i > 0 && c != '<' && c != '>'; 
			c = g_utf8_get_char( tmp ), -- i ) {
			tmp = g_utf8_prev_char( tmp );
		}
		ret = ( c == '<' );
	}
	if( ret ) {
		/* begin forward scan */
		/* s is the candidate start offset, -1 while there
		   is none; c still holds the '<' found above */
		gint s = -1;
		gunichar tc;
		
		for( tc = g_utf8_get_char( tmp );
			i < pos && tc != '>'; ) {
			
			tmp = g_utf8_next_char( tmp );
			i ++;
			tc = g_utf8_get_char( tmp );
	
			if( tc != ' ' ) {
				/* we got something */

				if( tc == '"' && c != '"' ) {
					c = '"';
					s = i;
				} else if( tc == '\'' && c != '\'' ) {
					c = '\'';
					s = i;
				} else if( tc == '=' && c != '\0' ) {
					c = '\0';
					s = -1;
				} else if( c == '\0' ) {
					s = i;
					c = tc;
				}
			} else if( c != '"' && c != '\'' && c != '\0' ) {
				c = '\0';
				s = -1;
			}
		}
		
		/* only succeed if the scan reached pos with a
		   candidate start offset recorded */
		ret = ( i == pos && s != -1 );
		if( ret && start ) {
			*start = s;
		}
	}

	return ret;
}

/* Check whether the character offset pos in text lies inside a tag.

   The text is scanned backwards from pos for a '<' (stopping early on
   a '>'), then forwards from that '<' for the matching terminator:
     <!-- ... -->      comments
     <![CDATA ... ]]>  CDATA sections
     <% ... %>  and  <? ... ?>   ASP / processing instructions
     < ... >           everything else

   start, if not NULL, receives the offset where the backward scan
   stopped (written even when FALSE is returned).
   end, if not NULL, receives the offset computed by the forward scan
   (written whenever a '<' was found).

   Returns TRUE when both the opening '<' and the terminator for the
   kind of tag were found.

   NOTE(review): the forward scan restarts its offset counter from pos
   rather than from the offset of the '<', and the special cases add
   different constants - confirm what callers expect of *end before
   relying on it when pos is not on the '<'. */
gboolean screem_markup_is_tag( const gchar *text, gint pos,
			       gint *start, gint *end )
{
	gint i;
	gunichar c;
	const gchar *stext;
	const gchar *tmp;
	gboolean ret = FALSE;

	/* first go back */
	stext = text;
	text = g_utf8_offset_to_pointer( text, pos );
	tmp = text;
	i = pos;

	/* when sitting on the closing '>' step over it so that it
	   doesn't stop the backward scan straight away */
	if( pos > 0 && *tmp == '>' ) {
		tmp = g_utf8_prev_char( tmp );
		i --;
	}
	for( c = g_utf8_get_char( tmp );
		tmp > stext &&
		   c != '<' && c != '>';
		c = g_utf8_get_char( tmp ), -- i ) {
		tmp = g_utf8_prev_char( tmp );
	}

	ret = ( c == '<' );
	if( start ) {
		*start = i;
	}

	if( ret ) {
		/* go forward now */
		const gchar *temp;

		i = pos;
		temp = NULL;
		tmp = g_utf8_next_char( tmp );
		c = g_utf8_get_char( tmp );
		i ++;

		/* special case for comments */
		if( c == '!' && tmp[ 1 ] == '-' && tmp[ 2 ] == '-' ) {
			temp = strstr( tmp + 3, "-->" );
			if( temp ) {
				i += g_utf8_pointer_to_offset( stext,
							temp ) -
					g_utf8_pointer_to_offset( stext,
							tmp );
				i += 4;
			}
			ret = ( temp != NULL );
		} else if( c == '!' &&
			tmp[ 1 ] == '[' &&
			tmp[ 2 ] == 'C' &&
			tmp[ 3 ] == 'D' &&
			tmp[ 4 ] == 'A' &&
			tmp[ 5 ] == 'T' &&
			tmp[ 6 ] == 'A' ) {
			/* special case for <![CDATA[ */
			temp = strstr( tmp + 7, "]]>" );
			if( temp ) {
				i += g_utf8_pointer_to_offset( stext,
							temp ) -
					g_utf8_pointer_to_offset( stext,
							tmp );
				i += 4;
			}
			/* an unterminated section is not a tag, the
			   same as for every other kind handled here */
			ret = ( temp != NULL );
		} else if( ( c == '%' || c == '?' ) ) {
			/* special case for asp, php etc */
			const gchar *ptemp;
			gunichar tc;

			temp = g_utf8_next_char( tmp );

			/* look for the marker character followed
			   directly by '>' */
			while( temp ) {
				temp = g_utf8_strchr( temp, -1, c );
				tc = '\0';
				ptemp = NULL;
				if( temp ) {
					ptemp = g_utf8_next_char( temp );
					tc = g_utf8_get_char( ptemp );
				}
				if( tc == '>' ) {
					i += g_utf8_pointer_to_offset( stext, temp ) - g_utf8_pointer_to_offset( stext, tmp );
					i += 2;
					break;
				}
				temp = ptemp;
			}
			ret = ( temp != NULL );
		} else {
			while( c != '\0' && c != '>' )	{
				i ++;
				tmp = g_utf8_next_char( tmp );
				c = g_utf8_get_char( tmp );
			}
			ret = ( c == '>' );
		}
		if( end ) {
			*end = i;
		}
	}

	return ret;
}

/* Find the first tag in text at or after the character offset pos.

   Handles processing instructions (<? ... ?>), ASP blocks
   (<% ... %>), comments (<!-- ... -->) and CDATA sections
   (<![CDATA ... ]]>) as well as ordinary tags.

   start / end, if not NULL, receive the character offsets of the
   '<' and of the character following the end of the tag; they are
   only written when a tag is found.
   name, if not NULL, receives a newly allocated copy of the tag name
   (including the leading '/' of a closing tag, "%" for ASP, "!--" for
   comments and "[CDATA]" for CDATA sections) which the caller must
   g_free(); it is set to NULL when no tag is found.

   Returns a newly allocated copy of the complete tag text, or NULL if
   pos is negative, there is no '<', or the tag is never closed. */
gchar *screem_markup_next_tag( const gchar *text, gint pos, 
			       gint *start, gint *end, gchar **name )
{
	const gchar *tag;
	const gchar *tend;
	const gchar *temp;
	gunichar c;
	gchar *ret;

	gint s;
	gint e;

	/* never leave the caller with an uninitialised name */
	if( name ) {
		*name = NULL;
	}

	if( pos < 0 ) {
		g_warning( "ERROR\n" );
		return NULL;
	}

	if( ! start ) {
		start = &s;
	}
	if( ! end ) {
		end = &e;
	}

	ret = NULL;
	tag = g_utf8_offset_to_pointer( text, pos );
	tag = g_utf8_strchr( tag, -1, '<' );
	if( tag ) {
		temp = tend = g_utf8_next_char( tag );
		c = g_utf8_get_char( tend );
		switch( c ) {
		case '?':
			/* special case for PI */
			if( name ) {
				while( c != '\0' && c != '>' &&
					! g_unichar_isspace( c ) ) {
					tend = g_utf8_next_char( tend );
					c = g_utf8_get_char( tend );
				}
				*name = g_strndup( temp, 
						tend - temp );
			}
			tend = strstr( tend, "?>" );
			if( tend ) {
				tend += strlen( "?>" );
			}
			break;
		case '%':
			/* special case for ASP stuff */
			if( name ) {
				*name = g_strdup( "%" );
			}
			tend = strstr( tend, "%>" );
			if( tend ) {
				tend += strlen( "%>" );
			}
			break;
		case '!':
			if( tend[ 1 ] == '-' && tend[ 2 ] == '-' ) {
				/* special case for comments */
				if( name ) {
					*name = g_strdup( "!--" );
				}
				tend = strstr( tend, "-->" );
				if( tend ) {
					tend += strlen( "-->" );
				} 
				break;
			} else if( tend[ 1 ] == '[' &&
				   tend[ 2 ] == 'C' &&
				   tend[ 3 ] == 'D' &&
				   tend[ 4 ] == 'A' &&
				   tend[ 5 ] == 'T' &&
				   tend[ 6 ] == 'A' ) {
				/* special case for CDATA sections */
				if( name ) {
					*name = g_strdup( "[CDATA]" );
				}
				tend = strstr( tend, "]]>" );
				if( tend ) {
					tend += strlen( "]]>" );
				}
				break;
			}
		/* deliberate drop through */
		default:
			if( name ) {
				/* the first character is always taken
				   as part of the name so that the '/'
				   of a closing tag is kept, but we
				   must not step over the end of the
				   text or the '>' of an empty tag */
				if( c != '\0' && c != '>' ) {
					do {
						tend = g_utf8_next_char( tend );
						c = g_utf8_get_char( tend );
					} while( c != '/' &&
						c != '\0' && c != '>' &&
						! g_unichar_isspace( c ) ); 
				}

				*name = g_strndup( temp, 
						tend - temp );
			}
			tend = g_utf8_strchr( tend, -1, '>' );
			if( tend ) {
				tend = g_utf8_next_char( tend );
			}
			break;
		}
		/* if the tag isn't closed we haven't found a valid tag */
		if( ! tend ) {
			if( name ) {
				g_free( *name );
				*name = NULL;
			}
		} else {
			*start = g_utf8_pointer_to_offset( text, tag );
			*end = g_utf8_pointer_to_offset( text, tend );

			ret = g_strndup( tag, tend - tag ); 
		}
	}

	return ret;
}

/* Build a copy of text in which every character that has an entity
   name (see screem_markup_char_to_ent) is replaced by "&name;".
   A character that screem_markup_is_entity() reports as already
   starting an entity is copied through untouched.

   Returns a newly allocated string which the caller must g_free(). */
gchar *screem_markup_encode_text( const gchar *text )
{
	GString *encoded;
	const gchar *cursor;
	const gchar *entity;
	gunichar ch;

	encoded = g_string_new( NULL );

	cursor = text;
	ch = g_utf8_get_char( cursor );
	while( ch != '\0' ) {
		entity = screem_markup_char_to_ent( ch );

		if( entity &&
		    ! screem_markup_is_entity( cursor, 0, NULL, NULL ) ) {
			g_string_append_c( encoded, '&' );
			g_string_append( encoded, entity );
			g_string_append_c( encoded, ';' );
		} else {
			g_string_append_unichar( encoded, ch );
		}

		cursor = g_utf8_next_char( cursor );
		ch = g_utf8_get_char( cursor );
	}

	/* hand the character data over to the caller, releasing only
	   the GString wrapper */
	return g_string_free( encoded, FALSE );
}

/* Return a copy of text in which the markup (tag and attribute
   names) has been converted to upper case when upper is TRUE, or to
   lower case otherwise.

   Text between tags and attribute values are copied unchanged.
   Declarations / comments (<! ...), processing instructions (<? ...)
   and ASP blocks (<% ...) are not markup whose case we may change, so
   they are copied through verbatim.

   Attribute values may be double quoted, single quoted or unquoted
   (in which case they end at the next whitespace); whitespace between
   the '=' and the value is allowed.

   Returns a newly allocated string which the caller must g_free(). */
gchar *screem_markup_change_case( const gchar *text, gboolean upper )
{
	GString *str;
	gchar *next;
	gchar *name;
	const gchar *cursor;
	gint pos;
	/* these are handed to screem_markup_next_tag() which takes
	   gint pointers */
	gint start;
	gint end;
	gboolean in_attr;
	gunichar attr_term;
	gunichar c;

	str = g_string_new( NULL );
	pos = 0;

	while( ( next = screem_markup_next_tag( text, pos, &start, &end,
						&name ) ) ) {
		/* copy the text in front of the tag as is */
		if( start > pos ) {
			g_string_append_utf8_len( str,
					g_utf8_offset_to_pointer( text, pos ),
					start - pos );
		}
		pos = end;

		if( *name == '!' || *name == '?' || *name == '%' ) {
			/* non markup tag, so we don't change case */
			g_string_append( str, next );
		} else {
			/* walk the whole tag rather than stopping at
			   the first '>' so nothing of it can be lost */
			in_attr = FALSE;
			attr_term = '\0';
			for( cursor = next; *cursor != '\0';
			     cursor = g_utf8_next_char( cursor ) ) {
				c = g_utf8_get_char( cursor );

				if( in_attr ) {
					if( attr_term == '\0' ) {
						/* still looking for the
						   start of the value */
						if( c == '"' || c == '\'' ) {
							attr_term = c;
						} else if( ! g_unichar_isspace( c ) ) {
							attr_term = ' ';
						}
					} else if( attr_term == ' ' ) {
						in_attr = ! g_unichar_isspace( c );
					} else {
						in_attr = ( c != attr_term );
					}
				} else if( g_unichar_isalpha( c ) ) {
					c = upper ? g_unichar_toupper( c ) :
						    g_unichar_tolower( c );
				} else if( c == '=' ) {
					in_attr = TRUE;
					attr_term = '\0';
				}
				g_string_append_unichar( str, c );
			}
		}

		g_free( name );
		g_free( next );
	}

	/* whatever follows the last tag */
	g_string_append( str, g_utf8_offset_to_pointer( text, pos ) );

	return g_string_free( str, FALSE );
}

/* Check whether the first tag found in text at or after the
   character offset pos is the closing tag for tname; the name
   comparison ignores case.

   Returns FALSE when there is no tag at all. */
gboolean screem_markup_next_tag_close( const gchar *text,
				       const gchar *tname, gint pos )
{
	gchar *found_tag;
	gchar *found_name;
	gboolean is_close;

	found_tag = screem_markup_next_tag( text, pos, NULL, NULL,
					    &found_name );
	if( ! found_tag ) {
		return FALSE;
	}

	/* a closing tag has a name of the form "/tname" */
	is_close = FALSE;
	if( found_name[ 0 ] == '/' ) {
		is_close = ( g_strcasecmp( found_name + 1, tname ) == 0 );
	}

	g_free( found_name );
	g_free( found_tag );

	return is_close;
}

/* Split the attributes of tag into name / value pairs and prepend
   them to list.

   Every name and value is a newly allocated string owned by the
   list; value is NULL for an attribute given without one.  Values
   may be double or single quoted (a quote preceded by a backslash
   does not end the value), quoted with backslash escaped quotes, or
   unquoted.

   NOTE: this returns the list in reverse order, I.E
   end of tag <-- value, name, value, name  -->  start of tag */
GSList *screem_markup_build_attributes_list( const gchar *tag, GSList *list )
{
	const gchar *tmp;
	gunichar c;
	
	tmp = tag;
	do {
		const gchar *equals;
		gchar *name;
		gchar *value;

		/* step over the tag name / previous value and the
		   whitespace following it */
		tmp = screem_utf8_skip_to_space( tmp );
		tmp = g_utf8_skip_space( tmp );

		if( *tmp == '\0' || *tmp == '>' || *tmp == '/' ) {
			break;
		}
	
		equals = g_utf8_strchr( tmp, -1, '=' );
		if( ! equals ) {
			/* assume single name attr,
			   such as noshade in html on an hr */
			equals = screem_utf8_skip_to_space( tmp );
		
			if( *equals == '\0' ) {
				break;
			}
		}

		/* we have an attr name from tmp to equals */
		name = g_strndup( tmp, equals - tmp );
		if( *equals == ' ' ) {
			value = NULL;
			tmp = equals;
		} else {
			equals = g_utf8_next_char( equals );
			equals = g_utf8_skip_space( equals );
			c = g_utf8_get_char( equals );
			if( c == '"' || c == '\'' ) {
				equals = g_utf8_next_char( equals );
				tmp = equals;
				/* find the closing quote, stepping over
				   any quote escaped with a backslash.
				   The search has to restart after the
				   escaped quote, otherwise the same one
				   is found over and over again */
				for( ;; ) {
					tmp = g_utf8_strchr( tmp,
							-1, c );
					if( ! tmp || *(tmp-1) != '\\' ) {
						break;
					}
					tmp = g_utf8_next_char( tmp );
				}
			} else if( c == '\\' ) {
				equals ++;
				if( *equals == '"' ) {
					equals ++;
					tmp = strstr( equals, "\\\"" );
				} else if( *equals == '\'' ) {
					equals ++;
					tmp = strstr( equals, "\\'" );
				} else {
					/* treats as unquoted */
					tmp = strchr( equals, ' ' );
				}
			} else {
				tmp = screem_utf8_skip_to_space( equals );
			}

			if( ! tmp ) {
				/* no terminator, take everything up
				   to the end of the tag instead */
				tmp = strchr( equals, '>' );
			
				if( ! tmp ) {
					g_free( name );
					break;
				}
			}
			value = g_strndup( equals, tmp - equals );
		}
		/* add to list */
		name = g_strchug( name );
		list = g_slist_prepend( list, name );
		list = g_slist_prepend( list, value );
	} while( tmp );

	return list;
}

/* Collect the values of all link attributes found in the tags of
   text.

   When dtd is given, screem_dtd_attr_is_uri() decides which
   attributes hold links; without a dtd the src, data, href and url
   attributes are used (compared ignoring case).

   Attributes without a value never produce an entry, so the list
   holds no NULL data.

   Returns a list of newly allocated strings in document order; the
   caller owns both the list and the strings. */
GSList *screem_markup_get_links( ScreemDTD *dtd, const gchar *text )
{
	GSList *ret;
	gchar *next;
	gint pos;
	gint end;
	gchar *name;

	pos = 0;
	end = 0;
	ret = NULL;

	while( ( next = screem_markup_next_tag( text, pos, 
						NULL, &end, &name ) ) ) {
		/* get tag attributes */
		GSList *attr;
		GSList *tmp;

		pos = end;

		attr = screem_markup_build_attributes_list( next, NULL );

		/* any links among them?  the list is made up of
		   value, name pairs */
		for( tmp = attr; tmp; tmp = tmp->next ) {
			GSList *temp;
			const gchar *aname;
			gboolean is_link;

			temp = tmp;
			tmp = tmp->next;
			aname = (const gchar*)tmp->data;

			is_link = FALSE;
			if( ! temp->data ) {
				/* no value, so there is nothing that
				   could be a link */
			} else if( dtd ) {
				is_link = screem_dtd_attr_is_uri( dtd, name,
								  tmp->data );
			} else {
				/* fallback mode if no dtd is passed,
				   then we check src="" href="" data=""
				   and url="" */
				is_link = ( ! g_strcasecmp( "src", aname ) ||
					    ! g_strcasecmp( "data", aname ) ||
					    ! g_strcasecmp( "href", aname ) ||
					    ! g_strcasecmp( "url", aname ) );
			}

			if( is_link ) {
				/* ownership of the value moves to ret */
				ret = g_slist_prepend( ret, temp->data );
			} else {
				g_free( temp->data );
			}
			g_free( tmp->data );
		}
		g_slist_free( attr );
		
		g_free( next );
		g_free( name );
	}

	ret = g_slist_reverse( ret );

	return ret;
}

/* Fix up the links in page, picking the routine to use from the
   type of the page: markup pages go to
   screem_markup_fix_html_links(), pages with a mime type of
   text/css go to screem_markup_fix_css_links(), anything else is
   left alone.  Nothing is done for a site which has its fake flag
   set. */
void screem_markup_fix_links( ScreemSite *site,
			      ScreemPage *page,
			      const gchar *pagepath,
			      const gchar *origpagepath,
			      const gchar *source,
			      const gchar *dest )
{
	const gchar *type;

	if( screem_site_get_fake_flag( site ) ) {
		return;
	}

	type = screem_page_get_mime_type( page );

	if( screem_page_is_markup( page ) ) {
		screem_markup_fix_html_links( site, page, 
					      pagepath, origpagepath,
					      source, dest );
		return;
	}

	if( type && strcmp( "text/css", type ) == 0 ) {
		screem_markup_fix_css_links( site, page,
					     pagepath, origpagepath,
					     source, dest );
	}
}

/* Append the attributes of tag to str, always quoting the values
   and rewriting the value of every attribute that
   screem_dtd_attr_is_uri() reports as holding a URI.

   str is expected to already hold "<name"; the text between the
   attributes and whatever follows the last one is copied from tag.

   When update_all is FALSE only values that paths_match() matches
   against source are changed, to the path of dest relative to
   pageroot (plus the remainder of the match if source is a
   directory).  When update_all is TRUE every URI is made absolute
   against pageroot and then relative to destroot.  A fragment
   identifier found on the original value is put back on the
   rewritten one. */
static void process_attrs( GString *str, const gchar *tag,
			const gchar *name, ScreemDTD *dtd,
			gboolean isdir, gboolean update_all,
			const gchar *pageroot,
			const gchar *destroot,
			const gchar *source,
			const gchar *dest )
{
	GSList *attrs;
	GSList *tmp;
	
	gchar *aname;
	gchar *value;

	const gchar *offset;
	const gchar *aoffset;
	gunichar c;
	gboolean added;

	GnomeVFSURI *uri;
	const gchar *fragment;
	gchar *match;
	gchar *newpath;
	gchar *temp;
	
	/* the list comes back reversed, put it in document order as
	   name, value, name, value */
	attrs = screem_markup_build_attributes_list( tag, NULL );
	attrs = g_slist_reverse( attrs );
	
	offset = tag + strlen( name );
	offset ++; /* for the < */

	for( tmp = attrs; tmp; tmp = tmp->next ) {
		aname = tmp->data;
		tmp = tmp->next;
		value = tmp->data;

		aoffset = strstr( offset, aname );

		g_assert( aoffset != NULL );
		
		/* copy whatever separates this attribute from the
		   previous one */
		if( aoffset != offset ) {
			g_string_append_len( str, offset,
					aoffset - offset );
			offset = aoffset;
		}
		offset += strlen( aname );
		
		if( value ) {
			offset ++; /* for the = */
		}
		c = g_utf8_get_char( offset );
		
		/* always quote attributes */
		added = FALSE;
		if( c != '"' && c != '\'' ) {
			c = '"';
			added = TRUE;
		}
		
		g_string_append( str, aname );
		if( value ) {
			offset += strlen( value );
			if( ! added ) {
				offset += 2; /* for the quotes */
			}
			g_string_append_c( str, '=' );
			g_string_append_unichar( str, c );

			/* process value if aname is a
			 * uri attribute */
			if( screem_dtd_attr_is_uri( dtd, name,
						aname ) ) {

				uri = gnome_vfs_uri_new( value );
				/* no uri means no fragment, rather
				   than an empty one which would leave
				   a stray # on the new value */
				fragment = NULL;
				if( uri ) {
					fragment = gnome_vfs_uri_get_fragment_identifier( uri );
				}
				if( ! update_all ) {
					match = paths_match( pageroot,
							value,
							source );
					newpath = relative_path( dest,
							pageroot );
					if( isdir && match ) {
						temp = newpath;

						newpath = g_strconcat( newpath, match + strlen( source ), NULL );
						g_free( temp );
					}
				} else {
					match = relative_to_full( value,
							pageroot );
					newpath = relative_path( match,
							destroot );
				}

				if( match ) {
					g_free( value );
					value = newpath;
				} else {
					/* not using it, don't leak it */
					g_free( newpath );
				}
				if( match && fragment ) {
					temp = value;
					value = g_strdup_printf( "%s#%s", value, fragment );
					g_free( temp );
				}
				g_free( match );
				if( uri ) {
					gnome_vfs_uri_unref( uri );
				}
			}
		
			g_string_append( str, value );
			g_string_append_unichar( str, c );
		}
		g_free( aname );
		g_free( value );

	}
	/* the strings have been freed above, this releases the list
	   nodes themselves */
	g_slist_free( attrs );

	/* add any extra spacing extra that there was, and also
	 * the possible / */
	g_string_append( str, offset );
}

/* Rewrite the URI attributes of every tag in the markup of page so
   that they remain valid after a move / rename.

   pagepath is where the page is now and origpagepath where it used
   to be.  When both source and dest are given only links to source
   are changed to point at dest, unless pagepath itself begins with
   dest in which case - as when source / dest are not given - every
   link is recalculated.

   When source / dest are not given, tags found between
   "<!-- #BeginEditable " and "<!-- #EndEditable " comments are left
   untouched.  Closing tags, comments / declarations, processing
   instructions and ASP blocks are always copied as they are.

   The page data is only set if the result differs from the
   original text. */
static void screem_markup_fix_html_links( ScreemSite *site,
					  ScreemPage *page,
					  const gchar *pagepath,
					  const gchar *origpagepath,
					  const gchar *source,
					  const gchar *dest )
{
	gchar *text;
	const gchar *offset_text;
	gchar *name;
	gchar *tag;
	/* offsets relative to offset_text, gint as that is what
	   screem_markup_next_tag() fills in */
	gint start;
	gint end;

	gchar *pageroot;
	gchar *destroot;
	gboolean isdir;
	gboolean update_all;
	gboolean tupdate;
	ScreemDTD *dtd;
	gboolean ineditable;
	
	GString *str;

	text = screem_page_get_data( page );
	offset_text = text;

	str = g_string_new( NULL );
	pageroot = g_path_get_dirname( origpagepath );
	destroot = g_path_get_dirname( pagepath );
	isdir = screem_uri_is_dir( source );
	update_all = TRUE;
	tupdate = TRUE;
	if( source && dest ) {
		tupdate = FALSE;
		update_all = ( ! strncmp( pagepath, dest,
					strlen( dest ) ) );
	}
	dtd = screem_page_get_dtd( page );
	ineditable = FALSE;
	
	for( ;; ) {
		name = NULL;
		start = end = 0;

		/* always search from the start of what is left, so
		   start and end are relative to offset_text */
		tag = screem_markup_next_tag( offset_text, 0,
					&start, &end, &name );

		if( ! tag ) {
			/* no more tags, the rest is plain text.  Copy
			   exactly what remains rather than working
			   out a length, which must never take us
			   beyond the end of the string */
			g_string_append( str, offset_text );
			break;
		}

		if( start > 0 ) {
			/* text node */
			g_string_append_len( str, offset_text,
					g_utf8_offset_to_pointer( offset_text, start ) - offset_text );
		}

		if( tupdate && *name == '!' ) {
			if( ! strncmp( "<!-- #BeginEditable ",
					tag,
					strlen( "<!-- #BeginEditable " ) ) ) {
				ineditable = TRUE;
			} else if( ! strncmp( "<!-- #EndEditable ", 
					tag, 
					strlen( "<!-- #EndEditable " ) ) ) {
				ineditable = FALSE;
			}
		}

		if( ( ! ineditable ) &&
			*name != '/' &&
			*name != '!' &&
			*name != '?' &&
			*name != '%' ) {
			g_string_append_printf( str,
					"<%s", name );

			/* process attrs */
			process_attrs( str, tag, name, dtd,
					isdir, update_all,
					pageroot, destroot,
					source, dest );
		} else {
			/* append tag as is */
			g_string_append( str, tag );
		}

		/* carry on after the tag */
		offset_text = g_utf8_offset_to_pointer( offset_text,
							end );

		g_free( tag );
		g_free( name );
	}

	g_free( destroot );
	g_free( pageroot );
	
	/* only set the data if it has actually changed */
	if( strcmp( str->str, text ) ) {
		screem_page_set_data( page, str->str );
	}
	g_string_free( str, TRUE );
	
	g_free( text );
}

static void screem_markup_fix_css_links( ScreemSite *site,
					 ScreemPage *page,
					 const gchar *pagepath,
					 const gchar *origpagepath,
					 const gchar *source,
					 const gchar *dest )
{
	gchar *pageroot;
	gchar *destroot;

	gboolean isdir;

	gchar *match;
	gboolean update_all;

	gchar *data;
	gint length;
	GString *newdata;
	gchar *temp;
	gchar *tmp;

	gunichar c;

	pageroot = g_path_get_dirname( origpagepath );
	destroot = g_path_get_dirname( pagepath );

	isdir = screem_uri_is_dir( source );

	update_all = TRUE;
	if( source && dest ) {
		update_all = ( ! strncmp( pagepath, dest, 
					  strlen( dest ) ) );
	}

	/* need to check for and update @import, url() */
	data = screem_page_get_data( page );
	length = strlen( data );
	newdata = g_string_new( NULL );

	/* look for @imports first as the must be
	   at the start of the sheet */
	temp = data;
	while( ( tmp = strstr( temp, "@import" ) ) ) {
		gboolean isurl;

		tmp += strlen( "@import" );

		g_string_append_len( newdata, temp, tmp - temp );
		temp = tmp;

		isurl = FALSE;
		while( *tmp != '\0' ) {
			gchar *url;

			if( ! strncmp( "url(", tmp, strlen( "url(" ) ) ) {
				tmp += strlen( "url(" );
				isurl = TRUE;
			} else {
				tmp ++;
			}

			g_string_append_len( newdata, temp, tmp - temp );
			temp = tmp;

			c = g_utf8_get_char( tmp );
			if( ( isurl && ! g_unichar_isspace( c ) ) ||
			    c == '"' || c == '\'' ) {
				/* found url */
				gchar *end;
				gboolean leave;
				gchar *newpath;

				leave = TRUE;
				if( c != '"' && c != '\'' ) {
					c = ' ';
				} else {
					tmp = g_utf8_next_char( tmp );
				}
				end = g_utf8_strchr( tmp, -1, c );
				if( ! end ) {
					end = temp + length;
				}
				url = g_strndup( tmp, end - tmp );
			
				if( ! update_all ) {
					match = paths_match( pageroot, 
							     url,
							     source );
					newpath = relative_path( dest,
								 pageroot );
					if( isdir  && match ) {
						gchar *temp;
						
						temp = newpath;
						
						newpath = g_strconcat( newpath,
								       match + strlen( source ), NULL );
						g_free( temp );
					}
				} else {
					match = relative_to_full( url,
								  pageroot );
					newpath = relative_path( match,
								 destroot );
				}
				if( match ) {
					g_string_append_unichar( newdata, c );
					g_string_append( newdata,
							 newpath );
				
					g_free( newpath );
					
					leave = FALSE;
					g_free( match );
				}
				tmp += strlen( url );
				g_free( url );

				if( leave ) {
					g_string_append_len( newdata,
							     temp,
							     tmp - temp );
					temp = tmp;
				}

				break;
			}
		}

		temp = tmp;
	}
	/* now find any other url() instances */
	while( ( tmp = strstr( temp, "url(" ) ) ) {

		tmp += strlen( "url(" );
		g_string_append_len( newdata, temp, tmp - temp );
		temp = tmp;
		
		while( *tmp != '\0' ) {
			gchar *url;
			
			g_string_append_len( newdata, temp, tmp - temp );
			temp = tmp;

			c = g_utf8_get_char( tmp );
			if( ! g_unichar_isspace( c ) ) {
				/* found url */
				gchar *end;
				gboolean leave;
				gchar *newpath;

				leave = TRUE;
				c = g_utf8_get_char( tmp );
				if( c != '"' && c != '\'' ) {
					c = ' ';
				} else {
					tmp = g_utf8_next_char( tmp );
				}
				end = g_utf8_strchr( tmp, -1, c );
				if( ! end ) {
					end = temp + length;
				}
				url = g_strndup( tmp, end - tmp );
			
				if( ! update_all ) {
					match = paths_match( pageroot, 
							     url,
							     source );
					newpath = relative_path( dest,
								 pageroot );
					if( isdir  && match ) {
						gchar *temp;
						
						temp = newpath;
						
						newpath = g_strconcat( newpath,
								       match + strlen( source ), NULL );
						g_free( temp );
					}
				} else {
					match = relative_to_full( url,
								  pageroot );
					newpath = relative_path( match,
								 destroot );
				}
				if( match ) {
					g_string_append_unichar( newdata, c );
					g_string_append( newdata,
							 newpath );
					
					g_free( newpath );
					
					leave = FALSE;
					g_free( match );
				}
				tmp += strlen( url );
				g_free( url );

				if( leave ) {
					g_string_append_len( newdata,
							     temp,
							     tmp - temp );
					temp = tmp;
				}
				break;
			} else {
				tmp ++;
			}
		}
		temp = tmp;
	}

	g_string_append( newdata, temp );
	
	if( strcmp( newdata->str, data ) ) {
		screem_page_set_data( page, newdata->str );
	}
	g_string_free( newdata, TRUE );
	g_free( data );

	g_free( destroot );
	g_free( pageroot );
}

/* Bring page, which was created from template_page, in line with
   the current state of the template.

   template_tag is the text that marks the start of the templated
   part of the page; if it does not occur in the page nothing is
   changed.

   blocks is walked two entries at a time: the data of the first
   entry of each pair is appended to the new page text, the second
   entry is handed to screem_markup_get_region() to locate the
   matching editable region in the existing page, whose content (up
   to "<!-- #EndEditable -->") is then carried over.
   NOTE(review): presumably the list alternates template text and
   editable region names - confirm against the code building it.

   Anything from "<!-- #EndTemplate -->" onwards in the page is kept.
   Once the new data has been set the links are fixed with
   screem_markup_fix_links().

   Nothing is done if the page can not be loaded. */
void screem_markup_update_from_template( ScreemSite *site,
					 ScreemPage *template_page,
					 ScreemPage *page,
					 const gchar *template_tag,
					 GSList *blocks )
{
	gchar *text;
	const gchar *start;
	gchar *end;
	GString *newdata;
	GSList *first_block;
	
	const gchar *template_path;
	const gchar *page_path;
	
	if( ! screem_page_load( page, NULL ) ) {
		return;
	}
		
	template_path = screem_page_get_pathname( template_page );
	page_path = screem_page_get_pathname( page );
	
	/* NOTE(review): first_block is set but never read in this
	   function */
	first_block = blocks;
	
	text = screem_page_get_data( page );
	
	/* if template_tag occurs in text then we need to update page to
		match the changes in the template */
	start = strstr( text, template_tag );
	if( ! start ) {
		g_free( text );
		return;
	}
	
	newdata = g_string_new( NULL );
	start += strlen( template_tag );
	
	/* keep everything up to and including the template tag */
	g_string_append_len( newdata, text, start - text );
	
	/* chop text off where we find <!-- #EndTemplate --> */
	end = strstr( start, "<!-- #EndTemplate -->" );
	if( end ) {
		/* this overwrites the '<' of the end marker, it is
		   put back when the tail is appended below */
		*end = '\0';
	}

	/* start now holds the section of the page which is our template */
	while( blocks ) {
		const gchar *reg_start;
		
		g_string_append( newdata, blocks->data );
		blocks = blocks->next;
		
		reg_start = NULL;
		if( start ) {
			reg_start = screem_markup_get_region( start, 
							      blocks );
		}
		
		if( ! reg_start && blocks ) {
			/* a new region was added to the template */
			reg_start = start;
		} else {
			start = reg_start;
			reg_start = NULL;
		}
		
		
		if( blocks ) {
			blocks = blocks->next;
		}
		/* start == reg_start here means the region wasn't
		   found in the page, so there is nothing to carry
		   over for it */
		if( start && ( start != reg_start ) ) {
			gchar *endblock;
			
			/* start is at the "-->" ending the
			   #BeginEditable comment, copy what lies
			   between it and the #EndEditable comment */
			start += strlen( "-->" );
			endblock = strstr( start, "<!-- #EndEditable -->" );
			if( endblock ) {
				g_string_append_len( newdata, start, 
						     endblock - start );
			}
			start = endblock;
		}
	}
	
	if( end ) {
		/* restore the '<' that was overwritten above and
		   append the remainder of the page */
		g_string_append_c( newdata, '<' );
		g_string_append( newdata, end + 1 );
	}
	
	screem_page_set_data( page, newdata->str );
	
	g_free( text );
	g_string_free( newdata, TRUE );

	/* the links may now need fixing */
	screem_markup_fix_links( site, page, page_path,
				 template_path,
				 NULL, NULL );
}

/* Scans text for "<!-- #BeginEditable" markers looking for the one
   that belongs to the first entry of blocks.

   For every marker found, find_text() is asked for the first
   double quoted string from the marker onwards and that match is
   compared against blocks->data over the length find_text() reports
   (find_text() lives outside this file, presumably it returns a
   pointer to the match and stores the match length - confirm against
   its definition).

   Returns a pointer to the "-->" that follows the matching marker.
   NULL is returned if blocks is empty, if no marker matches, or if
   the matching marker has no "-->" after it */
static const gchar *screem_markup_get_region( const gchar *text, 
						GSList *blocks )
{
	const gchar *pos;
	gchar *quoted;
	gint quoted_len;

	if( ! blocks ) {
		return NULL;
	}

	pos = text;
	while( pos ) {
		pos = strstr( pos, "<!-- #BeginEditable" );
		if( ! pos ) {
			/* no more editable regions */
			break;
		}

		/* grab the name before stepping to the end of the
		   marker, the search for it starts at the marker */
		quoted = find_text( pos, "\"[^\"]*\"", NULL, &quoted_len );
		pos = strstr( pos, "-->" );

		if( quoted &&
		    strncmp( quoted, blocks->data, quoted_len ) == 0 ) {
			break;
		}
	}

	return pos;
}

/* Builds the text of a new, empty document: a DOCTYPE declaration
   for doctype followed by a skeleton document.

   db        DTD database, queried for the root element name and the
             system identifier that go with doctype
   doctype   public identifier placed in the DOCTYPE declaration
   frameset  TRUE to use the frameset skeleton, takes precedence
             over xhtml
   xhtml     TRUE to use the skeleton whose <html> element carries
             the XHTML namespace

   If the /apps/screem/editor/spaces_instead_of_tabs gconf key is set
   the skeleton is passed through find_text() with a run of
   /apps/screem/editor/tabwidth spaces as the replacement for "\t".
   A tab width below 1 (including an unset key, which reads as 0) is
   treated as 8.

   If the database has no root element for doctype "unknown" is used,
   if it has no system identifier the DOCTYPE is written with the
   public identifier only.

   Returns a newly allocated string, free it with g_free() */
gchar *screem_markup_basic_html( ScreemDTDDB *db, const gchar *doctype,
				gboolean frameset, gboolean xhtml )
{
	gchar *root;
	gchar *systemid;

	gchar *text;
	gchar *tmp;
	gchar *spaces;
	gfloat tabwidth;
	GConfClient *client;
	
	const gchar *ntext = "\
<html>\n\
\t<head>\n\
\t\t<title></title>\n\
\t</head>\n\
\t<body>\n\
\t\t\n\
\t</body>\n\
</html>\n";
	const gchar *xtext = "\
<html xmlns=\"http://www.w3.org/1999/xhtml\">\n\
\t<head>\n\
\t\t<title></title>\n\
\t</head>\n\
\t<body>\n\
\t\t\n\
\t</body>\n\
</html>\n";
	const gchar *frame = "\
<frameset rows=\"*\" cols=\"*\">\n\
\t<frame src=\"\" frameborder=\"1\" name=\"\" scrolling=\"auto\"></frame>\n\
\t<noframes>\n\
\t\t<body>\n\
\t\t\n\
\t\t</body>\n\
\t</noframes>\n\
</frameset>\n";

	if( frameset ) {
		ntext = frame;
	} else if( xhtml ) {
		ntext = xtext;
	}

	client = gconf_client_get_default();
	tabwidth = gconf_client_get_float( client,
			"/apps/screem/editor/tabwidth",
			NULL );
	if( ! ( tabwidth >= 1.0 ) ) {
		/* unset, negative, fractional or NaN; none of them can
		   be used as a character count */
		tabwidth = 8.0;
	}
	if( gconf_client_get_bool( client,
				"/apps/screem/editor/spaces_instead_of_tabs",
				NULL ) ) {
		spaces = g_strnfill( (gsize)tabwidth, ' ' );
		tmp = find_text( ntext, "\t", spaces, NULL );
		g_free( spaces );
	} else {
		tmp = g_strdup( ntext );
	}
	g_object_unref( client );
	
	/* NOTE(review): the result of this lookup was never used here,
	   the call is kept in case it has side effects that the
	   lookups below rely on - confirm and drop it if it has none */
	screem_dtd_db_get_dtd( db, doctype, NULL ); 

	root = screem_dtd_db_get_root( db, doctype, NULL );
	if( ! root ) {
		root = g_strdup( "unknown" );
	}
	systemid = screem_dtd_db_get_system_id( db,
					     doctype, NULL );
	if( systemid ) {
		text = g_strconcat( "<!DOCTYPE ", root,
					   " PUBLIC \"", doctype, 
					   "\" \"", systemid,
					   "\">", "\n", tmp, NULL );
	} else {
		/* g_strconcat() stops at the first NULL argument, a
		   missing system identifier would cut the document
		   off in the middle of the DOCTYPE */
		text = g_strconcat( "<!DOCTYPE ", root,
					   " PUBLIC \"", doctype, 
					   "\">", "\n", tmp, NULL );
	}
	g_free( tmp );
	g_free( systemid );
	g_free( root );

	return text;
}

/* Helper for screem_markup_get_charset(): works out the charset
   declared by an <?xml ?> element.

   The attribute list built from tag is read as pairs of nodes, the
   attribute value followed by the attribute name.  Every pair is
   checked for an encoding attribute, incomplete pairs are skipped.

   Returns a newly allocated copy of the encoding value, or of
   "UTF-8" if there is no encoding attribute with a value */
static gchar *screem_markup_charset_from_xml_decl( gchar *tag )
{
	GSList *list;
	GSList *attr;
	gchar *ret;

	list = screem_markup_build_attributes_list( tag, NULL );

	ret = NULL;
	for( attr = list; attr && attr->next; attr = attr->next->next ) {
		if( attr->data && attr->next->data &&
		    ! strcmp( "encoding", attr->next->data ) ) {
			ret = g_strdup( attr->data );
			break;
		}
	}
	if( ! ret ) {
		ret = g_strdup( "UTF-8" );
	}

	g_slist_foreach( list, (GFunc)g_free, NULL );
	g_slist_free( list );

	return ret;
}

/* Helper for screem_markup_get_charset(): works out the charset
   declared by a <meta> element.

   Two forms are recognised, the first one found in the attribute
   list decides which:
     http-equiv="content-type"  the charset is whatever follows
                                "charset=" in the content attribute
     name="charset"             the charset is the whole content
                                attribute
   The http-equiv value is matched ignoring case, so the common
   "Content-Type" spelling is accepted.
   NOTE(review): attribute names are still compared case sensitively,
   confirm whether screem_markup_build_attributes_list() normalises
   their case.

   Returns a newly allocated string, or NULL if the element does not
   declare a charset */
static gchar *screem_markup_charset_from_meta( gchar *tag )
{
	GSList *list;
	GSList *attr;
	gboolean found;
	gboolean content;
	const gchar *value;
	gchar *ret;

	list = screem_markup_build_attributes_list( tag, NULL );

	/* the list is read as pairs of nodes, value then name */
	found = FALSE;
	content = FALSE;
	for( attr = list; attr && attr->next; attr = attr->next->next ) {
		const gchar *attr_name = attr->next->data;
		const gchar *attr_value = attr->data;

		if( ! attr_name || ! attr_value ) {
			continue;
		}
		if( ! strcmp( "http-equiv", attr_name ) &&
		    ! g_strcasecmp( "content-type", attr_value ) ) {
			found = TRUE;
			content = TRUE;
			break;
		} else if( ! strcmp( "name", attr_name ) &&
			   ! strcmp( "charset", attr_value ) ) {
			found = TRUE;
			break;
		}
	}

	/* either form keeps its data in the content attribute */
	value = NULL;
	if( found ) {
		for( attr = list; attr && attr->next;
		     attr = attr->next->next ) {
			if( attr->next->data &&
			    ! strcmp( "content", attr->next->data ) ) {
				value = attr->data;
				break;
			}
		}
	}

	ret = NULL;
	if( value && content ) {
		const gchar *val;

		val = strstr( value, "charset=" );
		if( val ) {
			val += strlen( "charset=" );
			/* the name runs up to a space, a quote, the
			   ';' that starts another parameter, or the
			   end of the value */
			ret = g_strndup( val, strcspn( val, " \";" ) );
		}
	} else if( value ) {
		ret = g_strdup( value );
	}

	g_slist_foreach( list, (GFunc)g_free, NULL );
	g_slist_free( list );

	return ret;
}

/* Finds the character set declared in the markup held in text.

   Tags are taken from text one at a time with
   screem_markup_next_tag() until one of them yields a charset:
     - an <?xml ?> element gives the value of its encoding
       attribute, or UTF-8 if it has none, no matter what <meta />
       elements follow
     - a <meta http-equiv="content-type" /> element gives the
       charset= part of its content attribute
     - a <meta name="charset" /> element gives its content attribute
   The search gives up at the first tag that is not a closing tag, a
   processing instruction, or one of html, head, title, link, script,
   style, base, isindex, a comment or the doctype.  This is a bit of
   a hack for html; xml documents are supposed to start with the
   <?xml ?> element.

   text    the markup to examine, must not be NULL
   retpos  if not NULL and a charset was found, set to the start
           reported by screem_markup_next_tag() for the tag that
           declared it, otherwise set to 0
   retlen  if not NULL and a charset was found, set to the length of
           that tag (end - start), otherwise set to the length of
           text

   Returns a newly allocated string holding the charset, or NULL if
   none was found */
gchar *screem_markup_get_charset( const gchar *text, 
		gint *retpos, gint *retlen )
{
	guint len;
	guint pos;

	guint start;
	guint end;
	gchar *tag;
	gchar *name;	
	
	gchar *ret;
	
	len = strlen( text );
	pos = 0;
	start = 0;
	end = 0;

	ret = NULL;

	while( ( ! ret ) && ( pos < len ) &&
		( tag = screem_markup_next_tag( text, pos,
					       &start, &end, 
					       &name ) ) ) {
		g_assert( name );
		pos = end;
		if( ! strcmp( "?xml", name ) ) {
			/* xml PI element */
			ret = screem_markup_charset_from_xml_decl( tag );
		} else if( ! g_strcasecmp( "meta", name ) ) {
			/* meta element */
			ret = screem_markup_charset_from_meta( tag );
		} else if( g_strcasecmp( "html", name ) &&
			   g_strcasecmp( "head", name ) &&
			   g_strcasecmp( "title", name ) &&
			   g_strcasecmp( "link", name ) &&
			   g_strcasecmp( "script", name ) &&
			   g_strcasecmp( "style", name ) &&
			   g_strcasecmp( "!--", name ) &&
			   g_strcasecmp( "base", name ) &&
			   g_strcasecmp( "isindex", name ) &&
			   g_strcasecmp( "!DOCTYPE", name ) &&
			   *name != '/' && *name != '?' ) {
			/* limiting case, bit of a hack for html,
			   valid xml docs are supposed to have the
			   <?xml ?> element and have indeterminate 
			   element names so we can't stop if the doc is
			   invalid */
			pos = len;
		}
		g_free( tag );
		g_free( name );
	}

	if( ret ) {
		/* start and end still describe the tag that gave us
		   the charset, the loop stops as soon as ret is set */
		if( retpos ) {
			*retpos = start;
		}
		if( retlen ) {
			*retlen = end - start;
		}
	} else {
		if( retpos ) {
			*retpos = 0;
		}
		if( retlen ) {
			*retlen = len;
		}
	}

	return ret;
}

