sxmlc Git

Simple, lightweight XML parser in C, statically or dynamically linked.

Brought to you by: matthieu-labas
[41d36b]: / src / utils.c Maximize Restore History
416 lines (340 with data), 11.1 kB

/*
    This file is part of sxmlc.

    sxmlc is free software: you can redistribute it and/or modify
    it under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    sxmlc is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Lesser General Public License for more details.

    You should have received a copy of the GNU Lesser General Public License
    along with sxmlc.  If not, see <http://www.gnu.org/licenses/>.

	Copyright 2010 - Matthieu Labas
*/
#if defined(WIN32) || defined(WIN64)
#pragma warning(disable : 4996)
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include "utils.h"

/* Dictionary of special characters and their HTML equivalent */
static struct _html_special_dict {
	char chr;		/* Original character */
	char* html;		/* Equivalent HTML string */
	int html_len;	/* 'strlen(html)' */
} HTML_SPECIAL_DICT[] = {
	{ '<', "&lt;", 4 },
	{ '>', "&gt;", 4 },
	{ '"', "&quot;", 6 },
	{ '&', "&amp;", 5 },
	{ '\0', NULL, 0 }, /* Terminator */
};

int _bgetc(DataSourceBuffer* ds)
{
	if (ds == NULL || ds->buf[ds->cur_pos] == '\0') return EOF;
	
	return (int)(ds->buf[ds->cur_pos++]);
}

int _beob(DataSourceBuffer* ds)
{

	if (ds == NULL || ds->buf[ds->cur_pos] == '\0') return true;

	return false;
}

int read_line_alloc(void* in, DataSourceType in_type, char** line, int* sz_line, int i0, char from, char to, int keep_fromto, char interest, int* interest_count)
{
	int init_sz = 0;
	char c, *pt;
	int n, ret;
	int (*mgetc)(void* ds) = (in_type == DATA_SOURCE_BUFFER ? (int(*)(void*))_bgetc : (int(*)(void*))fgetc);
	int (*meos)(void* ds) = (in_type == DATA_SOURCE_BUFFER ? (int(*)(void*))_beob : (int(*)(void*))feof);
	
	if (in == NULL || line == NULL) return 0;
	
	if (to == '\0') to = '\n';
	/* Search for character 'from' */
	if (interest_count != NULL) *interest_count = 0;
	while (true) {
		c = (char)mgetc(in);
		if (interest_count != NULL && c == interest) (*interest_count)++;
		/* Reaching EOF before 'to' char is not an error but should trigger 'line' alloc and init to '' */
		/* If 'from' is '\0', we stop here */
		if (c == from || c == CEOF || from == '\0') break;
	}
	
	if (sz_line == NULL) sz_line = &init_sz;
	
	if (*line == NULL || *sz_line == 0) {
		if (*sz_line == 0) *sz_line = MEM_INCR_RLA;
		*line = (char*)malloc(*sz_line);
		if (*line == NULL) return 0;
	}
	if (i0 < 0) i0 = 0;
	if (i0 > *sz_line) return 0;
	
	n = i0;
	if (c == CEOF) { /* EOF reached before 'to' char => return the empty string */
		(*line)[n] = '\0';
		return meos(in) ? n : 0; /* Error if not EOF */
	}
	if (c != from || keep_fromto)
		(*line)[n++] = c;
	(*line)[n] = '\0';
	ret = 0;
	while (true) {
		c = (char)mgetc(in);
		if (interest_count != NULL && c == interest) (*interest_count)++;
		if (c == CEOF) { /* EOF or error */
			(*line)[n] = '\0';
			ret = meos(in) ? n : 0;
			break;
		}
		else {
			(*line)[n] = c;
			if (c != to || (keep_fromto && to != '\0' && c == to)) n++; /* If we reached the 'to' character and we keep it, we still need to add the extra '\0' */
			if (n >= *sz_line) { /* Too many characters for our line => realloc some more */
				*sz_line += MEM_INCR_RLA;
				pt = (char*)realloc(*line, *sz_line);
				if (pt == NULL) {
					ret = 0;
					break;
				}
				else
					*line = pt;
			}
			(*line)[n] = '\0'; /* If we reached the 'to' character and we want to strip it, 'n' hasn't changed and 'line[n]' (which is 'to') will be replaced by '\0' */
			if (c == to) {
				ret = n;
				break;
			}
		}
	}
	
#if 0 /* Automatic buffer resize is deactivated */
	/* Resize line to the exact size */
	pt = (char*)realloc(*line, n+1);
	if (pt != NULL)
		*line = pt;
#endif
	
	return ret;
}

/* --- */

char* strcat_alloc(char** src1, const char* src2)
{
	char* cat;
	int n;

	if (src1 == NULL || *src1 == src2) return NULL; /* Do not concatenate '*src1' with itself */

	/* Concatenate a NULL or empty string */
	if (src2 == NULL || *src2 == '\0') return *src1;

	n = (*src1 == NULL ? 0 : strlen(*src1)) + strlen(src2) + 1;
	cat = (char*)realloc(*src1, n);
	if (cat == NULL) return NULL;
	if (*src1 == NULL) *cat = '\0';
	*src1 = cat;
	strcat(*src1, src2);

	return *src1;
}

char* strip_spaces(char* str, char repl_sq, char protect)
{
	char *p;
	int i, len;
	
	if (isspace(protect)) return NULL;
	
	/* 'p' to the first non-space */
	for (p = str; *p && isspace(*p); p++) ; /* No need to search for 'protect' as it is not a space */
	len = strlen(str);
	for (i = len-1; isspace(str[i]); i--) ;
	if (str[i] == protect) i++; /* If last non-space is the protection, keep the last space */
	str[i+1] = '\0'; /* New end of string to last non-space */
	
	if (repl_sq == '\0') {
		if (p == str && i == len) return str; /* Nothing to do */
		for (i = 0; (str[i] = *p) != '\0'; i++, p++) ; /* Copy 'p' to 'str' */
		return str;
	}
	
	/* Squeeze all spaces with 'repl_sq' */
	i = 0;
	while (*p) {
		if (isspace(*p)) {
			str[i++] = repl_sq;
			while (isspace(*++p)) ; /* Skips all next spaces */
		}
		else {
			if (*p == protect) p++;
			str[i++] = *p++;
		}
	}
	str[i] = '\0';
	
	return str;
}

char* str_unescape(char* str)
{
	int i, j;

	if (str == NULL) return NULL;

	for (i = j = 0; str[j]; j++) {
		if (str[j] == '\\') j++;
		str[i++] = str[j];
	}

	return str;
}

int split_left_right(char* str, char sep, int* l0, int* l1, int* i_sep, int* r0, int* r1, int ignore_spaces, int ignore_quotes)
{
	int n0, n1, is;
	char quote = '"';

	if (str == NULL) return false;

	if (i_sep != NULL) *i_sep = -1;

	if (!ignore_spaces) ignore_quotes = false; /* No sense of ignore quotes if spaces are to be kept */

	/* Parse left part */

	if (ignore_spaces) {
		for (n0 = 0; str[n0] && isspace(str[n0]); n0++) ; /* Skip head spaces, n0 points to first non-space */
		if (ignore_quotes && (str[n0] == '"' || str[n0] == '\'')) { /* If quote is found, look for next one */
			quote = str[n0++];
			for (n1 = n0; str[n1] && str[n1] != quote; n1++) {
				if (str[n1] == '\\' && str[++n1] == '\0') break; /* Escape character (can be the last) */
			}
			for (is = n1 + 1; str[is] && isspace(str[is]); is++) ; /* '--' not to take quote into account */
		}
		else {
			for (n1 = n0; str[n1] && str[n1] != sep && !isspace(str[n1]); n1++) ; /* Search for separator or a space */
			for (is = n1; str[is] && isspace(str[is]); is++) ;
		}
	}
	else {
		n0 = 0;
		for (n1 = 0; str[n1] && str[n1] != sep; n1++) ; /* Search for separator only */
		if (str[n1] != sep) return false; /* Separator not found: malformed string */
		is = n1;
	}

	/* Here 'n0' is the start of left member, 'n1' is the character after the end of left member */

	if (l0 != NULL) *l0 = n0;
	if (l1 != NULL) *l1 = n1 - 1;
	if (i_sep != NULL) *i_sep = is;
	if (str[is] == '\0' || str[is+1] == '\0') { /* No separator => empty right member */
		if (r0 != NULL) *r0 = is;
		if (r1 != NULL) *r1 = is-1;
		if (i_sep != NULL) *i_sep = (str[is] == '\0' ? -1 : is);
		return true;
	}

	/* Parse right part */

	n0 = is + 1;
	if (ignore_spaces) {
		for (; str[n0] && isspace(str[n0]); n0++) ;
		if (ignore_quotes && (str[n0] == '"' || str[n0] == '\'')) quote = str[n0];
	}

	for (n1 = ++n0; str[n1]; n1++) {
		if (ignore_quotes && str[n1] == quote) break; /* Quote was reached */
		if (str[n1] == '\\' && str[++n1] == '\0') break; /* Escape character (can be the last) */
	}
	if (ignore_quotes && str[n1--] != quote) return false; /* Quote is not the same than earlier, '--' is not to take it into account */
	if (!ignore_spaces)
		while (str[++n1]) ; /* Jump down the end of the string */

	if (r0 != NULL) *r0 = n0;
	if (r1 != NULL) *r1 = n1;

	return true;
}

/* --- */

char* html2str(char* html, char* str)
{
	char *ps, *pd;
	int i;

	if (html == NULL) return NULL;

	if (str == NULL) str = html;
	
	/* Look for '&' and matches it to any of the recognized HTML pattern. */
	/* If found, replaces the '&' by the corresponding char. */
	/* 'p2' is the char to analyze, 'p1' is where to insert it */
	for (pd = str, ps = html; *ps; ps++, pd++) {
		if (*ps != '&') {
			if (pd != ps) *pd = *ps;
			continue;
		}
		
		for (i = 0; HTML_SPECIAL_DICT[i].chr; i++) {
			if (strncmp(ps, HTML_SPECIAL_DICT[i].html, HTML_SPECIAL_DICT[i].html_len)) continue;
			
			*pd = HTML_SPECIAL_DICT[i].chr;
			ps += HTML_SPECIAL_DICT[i].html_len-1;
			break;
		}
		/* If no string was found, simply copy the character */
		if (HTML_SPECIAL_DICT[i].chr == '\0' && pd != ps) *pd = *ps;
	}
	*pd = '\0';
	
	return str;
}

char* str2html(char* str, char* html)
{
	char *ps, *pd;
	int i;

	if (str == NULL || html == NULL) return NULL;

	if (html == str) return NULL; /* Not handled yet */

	for (ps = str, pd = html; *ps; ps++, pd++) {
		for (i = 0; HTML_SPECIAL_DICT[i].chr; i++) {
			if (*ps == HTML_SPECIAL_DICT[i].chr) {
				strcpy(pd, HTML_SPECIAL_DICT[i].html);
				pd += HTML_SPECIAL_DICT[i].html_len - 1;
				break;
			}
		}
		if (HTML_SPECIAL_DICT[i].chr == '\0' && pd != ps) *pd = *ps;
	}
	*pd = '\0';

	return str;
}

int strlen_html(char* str)
{
	int i, j, n;
	
	if (str == NULL) return 0;

	n = 0;
	for (i = 0; str[i]; i++) {
		for (j = 0; HTML_SPECIAL_DICT[j].chr; j++) {
			if (str[i] == HTML_SPECIAL_DICT[j].chr) {
				n += HTML_SPECIAL_DICT[j].html_len;
				break;
			}
		}
		if (HTML_SPECIAL_DICT[j].chr == '\0') n++;
	}

	return n;
}

int fprintHTML(FILE* f, char* str)
{
	char* p;
	int i, n;
	
	for (p = str, n = 0; *p; p++) {
		for (i = 0; HTML_SPECIAL_DICT[i].chr; i++) {
			if (*p != HTML_SPECIAL_DICT[i].chr) continue;
			fprintf(f, HTML_SPECIAL_DICT[i].html);
			n += HTML_SPECIAL_DICT[i].html_len;
			break;
		}
		if (HTML_SPECIAL_DICT[i].chr == '\0') {
			(void)fputc(*p, f);
			n++;
		}
	}
	
	return n;
}

int regstrcmp(char* str, char* pattern)
{
	char *p, *s;

	if (str == NULL && pattern == NULL) return true;

	if (str == NULL || pattern == NULL) return false;

	p = pattern;
	s = str;
	while (true) {
		switch (*p) {
			/* Any character matches, go to next one */
			case '?':
				p++;
				s++;
				break;

			/* Go to next character in pattern and wait until it is found in 'str' */
			case '*':
				for (; *p; p++) { /* Squeeze '**?*??**' to '*' */
					if (*p != '*' && *p != '?') break;
				}
				for (; *s; s++) {
					if (*s == *p) break;
				}
				break;

			/* NULL character on pattern has to be matched by 'str' */
			case 0:
				return *s ? false : true;

			default:
				if (*p == '\\') p++; /* Escape character */
				if (*p++ != *s++) return false; /* Characters do not match */
				break;
		}
	}
}
sxmlc Git

Simple, lightweight XML parser in C, statically or dynamically linked.

Branches

Tags

[41d36b]: / src / utils.c Maximize Restore History

416 lines (340 with data), 11.1 kB