p4wStrBuf.cpp #1

//
// Copyright 2001 Perforce Software.  All rights reserved.
//
// This file is part of Perforce - the FAST SCM System.
//
// p4wStrBuf:
//   StrBuf with the p4w-specific ::Expand() modifications.
//   Also includes utilities for escaping and unescaping characters
//   in urls and html.

#include <p4wp4.h>

#include "p4wStrBuf.h"
#include "validate.h"


// -------------------------------------
// p4w's ::Expand function.
//
// Expand the %var% tokens in s and append the result to this buffer.
// Each variable is looked up with the single-argument d.GetVar(var).
// isUnicode and bForceWrap are passed through to DoExpand unchanged.
// Returns *this.
p4wStrBuf& p4wStrBuf::Expand(const StrPtr &s, StrDict &d, int isUnicode, int bForceWrap)
{
	//
	// Used for GetVar(var): nargs is 0, so the x and y indices passed
	// here are ignored by DoExpand.
	return DoExpand(s, d, 0, 0, 0, isUnicode, bForceWrap);
}

// Expand the %var% tokens in s and append the result to this buffer.
// Each variable is looked up with d.GetVar(var, x).
// isUnicode and bForceWrap are passed through to DoExpand unchanged.
// Returns *this.
p4wStrBuf& p4wStrBuf::ExpandList(const StrPtr &s, StrDict &d, int x, int isUnicode, int bForceWrap)
{
	//
	// Used for GetVar(var, x) to get values from a list: nargs is 1, so
	// the y index passed here is ignored by DoExpand.
	return DoExpand(s, d, 1, x, 0, isUnicode, bForceWrap);
}

// Expand the %var% tokens in s and append the result to this buffer.
// Each variable is looked up with d.GetVar(var, x, y).
// Returns *this.
p4wStrBuf& p4wStrBuf::ExpandArray(const StrPtr &s, StrDict &d, int x, int y,
int isUnicode)
{
	//
	// Used for GetVar(var, x, y) to get values from a list of a list item.
	// No bForceWrap is passed, so DoExpand's declared default applies
	// (NOTE(review): the default lives in the header, not visible here).
	return DoExpand(s, d, 2, x, y, isUnicode);
}

// Shared implementation behind Expand(), ExpandList() and ExpandArray().
//
// Copies s into this buffer, replacing each %token% as it goes:
//   %%            -> a single literal %
//   %var%         -> the value of var, HTML-escaped via EscapeHTML()
//   %var@fmt%     -> the value of var parsed as an integer time and
//                    formatted with strftime(); in fmt a '*' stands for
//                    '%' and '**' for a literal '*'
//   %var?text%    -> text (appended verbatim) when var has no value
//
// nargs selects which StrDict::GetVar overload is used: 0 -> GetVar(var),
// 1 -> GetVar(var, x), 2 -> GetVar(var, x, y).  Any other value is treated
// as "no value found".  isUnicode and bForceWrap are handed to EscapeHTML().
//
// An unterminated % stops token processing; the remainder of s is copied
// as-is.  Returns *this.
p4wStrBuf& p4wStrBuf::DoExpand(const StrPtr &s, StrDict &d, int nargs, int x,
int y, int isUnicode, int bForceWrap)
{
	StrBuf		var;				// the %var% variable name

	const char	*p = s.Text();		// our current location in s
	const char	*start;				// the first % of a pair
	const char	*end;				// the last % of a pair

	//
	// Search through the string until we either run out of string or
	// find the last %.
	while( (start = strchr(p, '%')) != NULL ) {
		//
		// Locate the terminating %.
		end = strchr(start + 1, '%');

		//
		// No terminating %; this is an error, but we'll exit gracefully.
		if( end == NULL )
			break;

		//
		// %% found; copy everything up to and including the first %,
		// which yields a single literal %.
		if( end == start + 1 ) {
			Append( p, end - p );
			p = end + 1;
			continue;
		}

		//
		// We found a complete %var% token.  Work out where the variable
		// name ends and any options begin: '@' takes precedence over '?',
		// and with neither present the name runs up to the closing %.
		const char *option_start =
			(const char *)memchr(start + 1, '@', end - start - 1);
		if( option_start == NULL )
			option_start = (const char *)memchr(start + 1, '?', end - start - 1);
		if( option_start == NULL )
			option_start = end;

		//
		// Add everything before the start of the pair to our buffer and
		// move past this token.
		Append(p, start - p);
		p = end + 1;

		//
		// Get the value for this variable.  val starts out NULL so that an
		// unexpected nargs cannot leave it uninitialised.
		var.Set(start + 1, option_start - start - 1);
		StrPtr *val = NULL;
		switch( nargs ) {
			case 0:	// default case
				val = d.GetVar(var);
				break;
			case 1: // with 1 int arg
				val = d.GetVar(var, x);
				break;
			case 2: // with 2 int args
				val = d.GetVar(var, x, y);
				break;
			default: // unknown lookup form: behave as "not found"
				break;
		}

		//
		// If we didn't get a value, see if we were given a '?' option and
		// use its text instead.  Either way this token is finished.
		if( val == NULL ) {
			const char *fallback =
				(const char *)memchr(option_start, '?', end - option_start);
			if( fallback != NULL )
				Append(fallback + 1, end - fallback - 1);
			continue;
		}

		//
		// We were given a value.  See if we have to process any options
		// for it.
		switch( *option_start ) {
			case '%': // no options: append the escaped value and be done.
				EscapeHTML(*val, isUnicode, bForceWrap);
				break;

			case '@': { // convert time.
				StrBuf time_format;
				char time_buffer[255 + 1];

				//
				// Convert all *'s to %'s.  Convert **'s to *'s while
				// we're at it.
				for( const char * time_p = option_start + 1; time_p != end; time_p++ ) {
					if( *time_p != '*' ) {
						time_format.Append(time_p, 1);
					} else if( (time_p + 1 != end) && (*(time_p + 1) == '*') ) {
						time_format.Append("*");
						time_p++; // skip the trailing *
					} else {
						time_format.Append("%");
					}
				}

				//
				// Negative times are silently dropped.
				time_t t = atoi(val->Text());
				if( t < 0 )
					break;

				//
				// localtime() may fail and return NULL; never hand that
				// to strftime().
				struct tm *when = localtime(&t);
				if( when != NULL &&
				    strftime(time_buffer, sizeof(time_buffer), time_format.Text(), when) != 0 )
					Append(time_buffer);
				break;
			}

			default:
				// NOTE(review): reached for %var?text% when var HAS a
				// value.  Nothing is appended in that case (unchanged
				// from the previous behaviour) -- confirm whether the
				// value was meant to be emitted here.
				break;
		}
	}

	//
	// Append the rest of the string to our buffer and return the buffer.
	Append(p);
	return *this;
}

// Returns true when the byte at p is a UTF-8 continuation byte, i.e. its
// two most significant bits are "10" (10xxxxxx).
bool utf8Next(const char* p)
{
    const unsigned char byte = static_cast<unsigned char>(*p);
    return (byte >> 6) == 0x02;
}

// Decode the UTF-8 sequence that starts at p and return its code point.
//
// On a successful multi-byte decode, p is advanced by the number of EXTRA
// bytes consumed (1, 2 or 3), so that the caller's own p++ lands on the
// next character.  p is left untouched when the byte is returned as-is,
// which happens for:
//   - ASCII and stray continuation bytes (below 0xC0),
//   - a lead byte whose expected continuation bytes are missing,
//   - 0xF8..0xFF, which are not UTF-8 lead bytes at all.
//
// Over-long and out-of-range encodings are not rejected here; callers are
// expected to validate the text first.
int decodeUtf8(const char* &p)
{
    const unsigned char lead = (unsigned char)*p;

    // in utf-8 number of leading 1's indicates byte count
    // 110x xxxx >= 0xC0
    // 1110 xxxx >= 0xE0
    // 1111 0xxx >= 0xF0
    // the rest of the bytes are all 10xx xxxx

    if (lead < 0xC0)
        return lead;

    if (lead < 0xE0)
    {
        // 2 bytes - 110yyyxx 10xxxxxx
        if (!utf8Next(p+1))
            return lead;

        const int ret = ((lead & 0x1F) << 6) |
                         (p[1] & 0x3F);
        p += 1;
        return ret;
    }

    if (lead < 0xF0)
    {
        // 3 bytes - 1110yyyy 10yyyyxx 10xxxxxx
        // (the || short-circuits, so we never read past a terminating NUL)
        if (!utf8Next(p+1) || !utf8Next(p+2))
            return lead;

        const int ret = ((lead & 0x0F) << 12) |
                        ((p[1] & 0x3F) << 6) |
                         (p[2] & 0x3F);
        p += 2;
        return ret;
    }

    if (lead < 0xF8)
    {
        // 4 bytes - 11110zzz 10zzyyyy 10yyyyxx 10xxxxxx
        if (!utf8Next(p+1) || !utf8Next(p+2) || !utf8Next(p+3))
            return lead;

        const int ret = ((lead & 0x07) << 18) |
                        ((p[1] & 0x3F) << 12) |
                        ((p[2] & 0x3F) << 6) |
                         (p[3] & 0x3F);
        p += 3;
        return ret;
    }

    // 11111xxx is not a UTF-8 lead byte: hand it back unchanged instead of
    // misreading it as the start of a 4-byte sequence.
    return lead;
}

// -------------------------------------
// Append s to this buffer with HTML-significant characters escaped:
//   <  >  &  "  '  /   become  &lt; &gt; &amp; &quot; &#x27; &#x2F;
// When bForceWrap is set, each newline becomes <br>.  When isUnicode is
// not set, bytes above 0x7F are written as numeric character references.
//
p4wStrBuf& p4wStrBuf::EscapeHTML(const StrPtr &s, int isUnicode, int bForceWrap)
{
	// Only non-unicode mode emits numeric references, so only then do we
	// need to know whether the input can be decoded as UTF-8.
	bool decodeAsUtf8 = false;
	if (!isUnicode)
	{
		CharSetUTF8Valid checker;
		decodeAsUtf8 = (checker.Valid(s.Text(), s.Length()) == 1);
	}

	const char *cur = s.Text();
	while( *cur != '\0' ) {
		const unsigned char ch = *cur;

		switch( ch ) {
		case '<':
			Append("&lt;");
			break;
		case '>':
			Append("&gt;");
			break;
		case '&':
			Append("&amp;");
			break;
		case '"':
			Append("&quot;");
			break;
		case '\'':
			Append("&#x27;");
			break;
		case '/':
			Append("&#x2F;");
			break;
		case '\n':
			// A newline is only special when wrapping is forced.
			if( bForceWrap )
				Append("<br>");
			else
				Append(cur, 1);
			break;
		default:
			if( ch > 0x7F && !isUnicode ) {
				//
				// Outside of ascii: use the numeric value.
				Append( "&#" );

				// for security reasons, only decode valid Utf8;
				// decodeUtf8 moves cur past any extra bytes it used.
				if( decodeAsUtf8 )
					*this << (unsigned int)decodeUtf8(cur);
				else
					*this << (unsigned int)ch;

				Append( ";" );
			} else {
				Append(cur, 1);
			}
			break;
		}

		cur++;
	}

	//
	// We're done.
	return *this;
}

// -------------------------------------
// Append s to this buffer with <, >, & and " escaped for HTML, turning
// every "http:" / "https:" run (optionally wrapped in double quotes, matched
// case-insensitively) into a clickable <a href="...">...</a> link.
//
// When unicode is not set, bytes above 0x7F are written as numeric
// character references (decoded from UTF-8 only if the whole input is
// valid UTF-8).
//
// Characters inside a URL are HTML-escaped both in the href attribute and
// in the link text, so that a URL containing <, >, & or " cannot break out
// of the generated markup.
//
p4wStrBuf& p4wStrBuf::EscapeHTMLDoURLs(const StrPtr &s, int unicode)
{
	bool validUtf8 = false;
	if (!unicode)
	{
		CharSetUTF8Valid validator;
		validUtf8 = (validator.Valid(s.Text(), s.Length()) == 1);
	}

	// Link text for the URL currently being emitted.
	StrBuf urlText;

	//
	// Use a lower-case copy of this string in order to search for
	// "http:" using case-sensitive match.  pl walks the copy in step
	// with p walking the original.

	StrBuf lowered;
	lowered.Set(s.Text());
	StrOps::Lower(lowered);
	const char *pl = lowered.Text();

	for( const char * p = s.Text();
	*p != '\0'; p++, pl++ ) {

		unsigned char first = *p;

		//
		// See if we need to escape this character.

		if( first == '<' ) {
			Append("&lt;");

		} else if( first == '>' ) {
			Append("&gt;");

		} else if( first == '&' ) {
			Append("&amp;");

		//
		// Handle the case where we've found "http: or http:
		// or "https: or https:
		// by wrapping it within <A Href> directives to make it
		// an url

		} else if(
		( ( first == '"' ) && ( !strncmp( pl, "\"http:", 6 ) ||
					!strncmp( pl, "\"https:", 7 ) ) )
		|| ( ( *pl == 'h' ) && ( !strncmp( pl, "http:", 5 ) ||
					!strncmp( pl, "https:", 6 ) ) ) ) {
			const int fq = ( first == '"' );	// url started with a quote
			int l;					// length of the scheme prefix

			//
			// Found "http:" or "https:". Insure we handle case
			// correctly whether the url part is quoted or not.
			// In the quoted case the source quote itself opens the
			// href attribute value.

			if( fq ) {
				if( !strncmp( pl, "\"http:", 6 ) )
					l = 6;
				else
					l = 7;
				Append("<a href=");

			} else {
				if( !strncmp( pl, "http:", 5 ) )
					l = 5;
				else
					l = 6;
				Append("<a href=\"");
			}
			Append(p, l);
			urlText.Set(p, l);
			p += l;
			pl += l;

			//
			// Look for the end of the url. It can be
			// terminated by a quotation mark if it was
			// started with one. Otherwise, it can be
			// terminated by a space, non-ascii, or
			// newline character.

			const char *t = p;
			int ap = 0;	// set when the terminator still needs to be output
			for( ; *t != '\0'; t++ ) {
				first = *t;

				if( fq && first == '"' ) {
					// Closing quote: shown in the link text, consumed here.
					urlText.Append(t, 1);
					break;
				}

				if( !fq && first == ' ' ) {
					++ap;
					break;
				}

				if( first > 0x7F && !unicode ) {
					++ap;
					break;
				}

				if( first == '\n' ) {
					++ap;
					break;
				}

				//
				// Part of the url.  Escape anything that could end
				// the attribute value or open a tag.

				const char *piece = t;
				int pieceLen = 1;
				switch( first ) {
				case '<':
					piece = "&lt;";
					pieceLen = 4;
					break;
				case '>':
					piece = "&gt;";
					pieceLen = 4;
					break;
				case '&':
					piece = "&amp;";
					pieceLen = 5;
					break;
				case '"':	// only reachable for unquoted urls
					piece = "&quot;";
					pieceLen = 6;
					break;
				}

				Append(piece, pieceLen);
				urlText.Append(piece, pieceLen);
			}

			//
			// url has ended so end the <a href>, output
			// the text, and generate the </a>. This should
			// finish the html needed to generate this url.

			Append("\">");
			Append(urlText.Text());
			Append("</a>");

			if( *t == '\0' )
				break;

			pl += ( t - p );
			p = t;

			//
			// Don't forget the character that terminated the url.
			// Step back one so the loop increment lands on it again
			// and it gets the normal per-character treatment (a
			// non-ascii terminator must become a numeric reference,
			// not a raw byte).  A closing quote has already been
			// consumed and is skipped by the loop increment.

			if( ap ) {
				p--;
				pl--;
			}

		} else if( first == '"' ) {
			Append("&quot;");

		} else if ( first > 0x7F && !unicode ) {
			//
			// If it is outside of ascii, just use the numeric
			// value unless this is unicode mode
			//
			Append("&#");
			const char* origP = p;

			// for security reasons, only decode valid Utf8
			if (validUtf8)
				*this << (unsigned int)decodeUtf8(p);
			else
				*this << (unsigned int)first;

			// keep the lower-cased cursor in step with p
			pl += p - origP;
			Append(";");

		} else {
			Append(p, 1);
		}
	}

	//
	// We're done.
	return *this;
}

// -------------------------------------
// Escape and unescape URL's.
//
// Append s to this buffer with a subset of the reserved and unsafe URL
// characters percent-encoded.  The characters :?@=&#\. are deliberately
// left alone so that they keep their special meaning.  A space in the
// very first position is encoded too, and (unless isUnicode is set) so is
// every byte above 127.  Returns *this.
p4wStrBuf& p4wStrBuf::EscapeURL(const StrPtr &s, int isUnicode)
{
	// Each character in a *Set string maps to the entry with the same
	// index in the matching *Codes table.
	static const char * reservedSet = ";";
	static const char * reservedCodes[] = {
		"%3B"
	};

	static const char * unsafeSet = "<>\"%{}|^~[]`";
	static const char * unsafeCodes[] = {
		"%3C", "%3E", "%22", "%25", "%7B", "%7D", "%7C",
		"%5E", "%7E", "%5B", "%5D", "%60"
	};

	const char *const begin = s.Text();

	for( const char *cur = begin; *cur != '\0'; ++cur ) {
		const char *hit;

		//
		// A leading space must be escaped, even though it is neither
		// reserved nor unsafe, otherwise the result won't work
		// correctly when combined with base.
		if( cur == begin && *cur == ' ' ) {
			Append("%20");
			continue;
		}

		if( (hit = strchr(reservedSet, *cur)) != NULL ) {
			Append(reservedCodes[hit - reservedSet]);
			continue;
		}

		if( (hit = strchr(unsafeSet, *cur)) != NULL ) {
			Append(unsafeCodes[hit - unsafeSet]);
			continue;
		}

		if( (unsigned char)*cur > 127 && !isUnicode ) {
			// '%' plus two hex digits plus the terminator.
			char hexBuf[4];
			sprintf(hexBuf, "%%%x", (unsigned char)*cur);
			Append(hexBuf);
			continue;
		}

		Append(cur, 1);
	}

	//
	// We're done.
	return *this;
}

// Append s to this buffer with every space replaced by "%20"; all other
// characters are copied unchanged.  Returns *this.
p4wStrBuf& p4wStrBuf::EscapeSpaces(const StrPtr &s)
{
	const char *cur = s.Text();

	while( *cur != '\0' ) {
		if( *cur != ' ' )
			Append(cur, 1);
		else
			Append("%20");
		++cur;
	}

	return *this;
}

// Append s to this buffer with all reserved and unsafe URL characters
// percent-encoded.  Unlike EscapeURL, the characters that are special to
// p4web are encoded as well.  A space in the very first position is
// encoded too, and (unless isUnicode is set) so is every byte above 127.
// Returns *this.
p4wStrBuf& p4wStrBuf::EscapeURLAllChars(const StrPtr &s, int isUnicode)
{
	// Each character in a *Set string maps to the entry with the same
	// index in the matching *Codes table.
	static const char * reservedSet = ";/?:@=&";
	static const char * reservedCodes[] = {
		"%3B", "%2F", "%3F", "%3A", "%40", "%3D", "%26"
	};

	static const char * unsafeSet = "<>\"\'#%{}|\\^~[]`";
	static const char * unsafeCodes[] = {
		"%3C", "%3E", "%22", "%27", "%23", "%25", "%7B", "%7D", "%7C",
		"%5C", "%5E", "%7E", "%5B", "%5D", "%60"
	};

	const char *const begin = s.Text();

	for( const char *cur = begin; *cur != '\0'; ++cur ) {
		const char *hit;

		//
		// A leading space must be escaped, even though it is neither
		// reserved nor unsafe, otherwise the result won't work
		// correctly when combined with base.
		if( cur == begin && *cur == ' ' ) {
			Append("%20");
			continue;
		}

		if( (hit = strchr(reservedSet, *cur)) != NULL ) {
			Append(reservedCodes[hit - reservedSet]);
			continue;
		}

		if( (hit = strchr(unsafeSet, *cur)) != NULL ) {
			Append(unsafeCodes[hit - unsafeSet]);
			continue;
		}

		if( (unsigned char)*cur > 127 && !isUnicode ) {
			// '%' plus two hex digits plus the terminator.
			char hexBuf[4];
			sprintf(hexBuf, "%%%x", (unsigned char)*cur);
			Append(hexBuf);
			continue;
		}

		Append(cur, 1);
	}

	//
	// We're done.
	return *this;
}

// Return the numeric value (0-15) of the hexadecimal digit c, or -1 when
// c is not a hexadecimal digit.
static int p4wStrBufHexValue(char c)
{
	if( c >= '0' && c <= '9' )
		return c - '0';
	if( c >= 'a' && c <= 'f' )
		return c - 'a' + 10;
	if( c >= 'A' && c <= 'F' )
		return c - 'A' + 10;
	return -1;
}

// Append s to this buffer with every %XX escape (two hexadecimal digits)
// replaced by the byte it encodes.  When plus2sp is non-zero, each '+' is
// additionally turned into a space.
//
// A '%' that is not followed by two hexadecimal digits is not an escape
// and is copied through literally, together with whatever follows it.
// Returns *this.
p4wStrBuf& p4wStrBuf::UnescapeURL(const StrPtr &s, int plus2sp)
{
	for( const char * p = s.Text(); *p != '\0'; p++ ) {
		int hi;
		int lo;

		//
		// See if we need to unescape this character.  The && chain
		// stops at the first non-hex character, so the terminating
		// NUL is never read past.
		if( (*p == '%')
		&& (hi = p4wStrBufHexValue(*(p + 1))) >= 0
		&& (lo = p4wStrBufHexValue(*(p + 2))) >= 0 ) {
			// Convert the character.
			const char decoded = (char)((hi << 4) | lo);
			Append(&decoded, 1);

			// Skip over it.
			p += 2; // the other + 1 will happen in the loop
		} else if(plus2sp && *p == '+') {
			Append(" ", 1);
		} else {
			Append(p, 1);
		}
	}

	//
	// We're done.
	return *this;
}

//
// Append s to this buffer with every ampersand replaced by "%26"; all
// other characters are copied unchanged.  Returns *this.

p4wStrBuf& p4wStrBuf::EscapeAmp(const StrPtr &s)
{
	const char *segment = s.Text();
	const char *amp = strchr( segment, '&' );

	//
	// Nothing to escape: copy the whole string in one go.
	if( amp == NULL ) {
		Append( segment );
		return *this;
	}

	//
	// Copy the text between ampersands in chunks, emitting the
	// escape in place of each ampersand.
	while( amp != NULL ) {
		if( amp > segment )
			Append( segment, amp - segment );
		Append("%26");

		segment = amp + 1;
		amp = strchr( segment, '&' );
	}

	//
	// Whatever follows the last ampersand.
	if( *segment != '\0' )
		Append( segment );

	return *this;
}

// Append s to this buffer with every "%26" sequence replaced by the byte
// it encodes (0x26, an ampersand).  No other escape is touched.
// Returns *this.
p4wStrBuf& p4wStrBuf::UnescapeAmp(const StrPtr &s)
{
	const char *cur = s.Text();

	while( *cur != '\0' ) {
		//
		// The && chain stops at the first mismatch, so the
		// terminating NUL is never read past.
		if( cur[0] == '%' && cur[1] == '2' && cur[2] == '6' ) {
			Append( "\x26", 1 );
			cur += 3;
		} else {
			Append( cur, 1 );
			++cur;
		}
	}

	return *this;
}

//
// Append s to this buffer with every double quote replaced by "%22"; all
// other characters are copied unchanged.  Returns *this.

p4wStrBuf& p4wStrBuf::EscapeDQuotes(const StrPtr &s)
{
	const char *segment = s.Text();
	const char *quote = strchr( segment, '"' );

	//
	// Nothing to escape: copy the whole string in one go.
	if( quote == NULL ) {
		Append( segment );
		return *this;
	}

	//
	// Copy the text between quotes in chunks, emitting the escape
	// in place of each quote.
	while( quote != NULL ) {
		if( quote > segment )
			Append( segment, quote - segment );
		Append("%22");

		segment = quote + 1;
		quote = strchr( segment, '"' );
	}

	//
	// Whatever follows the last quote.
	if( *segment != '\0' )
		Append( segment );

	return *this;
}


// Normalize a base by appending it to this buffer with the characters
// special to urls escaped (see EscapeURL).  An empty s appends nothing.
// Returns *this.
p4wStrBuf& p4wStrBuf::NormalizeBase(const StrPtr &s, int isUnicode)
{
	if( s.Length() == 0 )
		return *this;

	//
	// Escape into a scratch buffer first, then append the result.
	p4wStrBuf escaped;
	escaped.EscapeURL( StrRef(s.Text()), isUnicode );
	Append( escaped.Text() );

	return *this;
}

// Append s to this buffer without its leading and trailing spaces.
// Nothing is appended when s is empty or consists only of spaces.
// Returns *this.
p4wStrBuf& p4wStrBuf::StripBlanks(const StrPtr &s)
{
	if( s.Length() == 0 )
		return *this;

	//
	// Skip the leading spaces.
	const char *head = s.Text();
	while( *head == ' ' )
		++head;

	//
	// Nothing but spaces.
	if( *head == '\0' )
		return *this;

	//
	// Back up from the end over the trailing spaces.  head points at
	// a non-space, so the scan stops there at the latest.
	const char *tail = head + strlen( head ) - 1;
	while( tail > head && *tail == ' ' )
		--tail;

	Append( head, tail - head + 1 );

	return *this;
}

// Append s to this buffer, percent-encoding the characters which have
// special meaning to p4 but are now allowed in filenames: @, #, % and *.
// Everything else is copied unchanged.  Returns *this.
p4wStrBuf& p4wStrBuf::EscapeP4Chars( const StrPtr &s )
{
	// p4Codes[i] is the encoding of p4Set[i].
	static const char * p4Set = "@#%*";
	static const char * p4Codes[] = {
		"%40", "%23", "%25", "%2A"
	};

	const char *cur = s.Text();

	while( *cur != '\0' ) {
		const char *hit = strchr( p4Set, *cur );

		if( hit == NULL )
			Append( cur, 1 );
		else
			Append( p4Codes[hit - p4Set] );

		++cur;
	}

	return *this;
}

// Append s to this buffer, decoding the escapes for the characters that
// have special meaning to p4: %40 (@), %23 (#), %25 (%) and %2A (*).
// Only these exact sequences are decoded (the 'A' must be upper case);
// every other character, including any other %XX, is copied unchanged.
// Returns *this.
p4wStrBuf& p4wStrBuf::UnescapeP4Chars(const StrPtr &s)
{
	for( const char *cur = s.Text(); *cur != '\0'; ++cur ) {
		// Byte to emit for a recognised escape, or 0 for "none".
		char decoded = '\0';

		//
		// Each && chain stops at the first mismatch, so the
		// terminating NUL is never read past.
		if( cur[0] == '%' ) {
			if( cur[1] == '4' && cur[2] == '0' )
				decoded = '\x40';
			else if( cur[1] == '2' && cur[2] == '3' )
				decoded = '\x23';
			else if( cur[1] == '2' && cur[2] == '5' )
				decoded = '\x25';
			else if( cur[1] == '2' && cur[2] == 'A' )
				decoded = '\x2A';
		}

		if( decoded == '\0' ) {
			Append( cur, 1 );
			continue;
		}

		Append( &decoded, 1 );

		// Skip over the two hex digits.
		cur += 2; // the other + 1 will happen in the loop
	}

	return *this;
}
#	Change	User	Description
#1	12234	Matt Attaway	Rejigger P4Web project in preparation for official sunsetting The bin directory contains the last official builds of P4Web from the Perforce download site. P4Web is soon to be completely sunsetted; these builds are here for folks who don't want to build their own. To better handle the archived builds the source code has been moved into a separate src directory.
//guest/perforce_software/p4web/Main/p4wStrBuf.cpp
#1	8914	Matt Attaway	Initial add of the P4Web source code