File: utf8.c

package info (click to toggle)
irssi-text 0.8.4-3.1
links: PTS
area: main
in suites: woody
size: 6,048 kB
ctags: 5,670
sloc: ansic: 51,220; sh: 8,913; perl: 1,076; makefile: 1,017
file content (177 lines) | stat: -rw-r--r-- 4,138 bytes
/* utf8.c - Operations on UTF-8 strings.
 *
 * Copyright (C) 2002 Timo Sirainen
 *
 * Based on GLib code by
 *
 * Copyright (C) 1999 Tom Tromey
 * Copyright (C) 2000 Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include "module.h"

/*
 * UTF8_COMPUTE(Char, Mask, Len) - classify the lead byte of a UTF-8 sequence.
 *
 * Char: value of the first byte of the sequence.  It is evaluated several
 *       times, so it must be free of side effects, and it should be an
 *       unsigned byte value: a negative (signed char) byte compares below
 *       128 and would be classified as a single-byte character.
 * Mask: lvalue that receives the mask selecting the payload bits of the
 *       lead byte, or 0 when Char is not a valid lead byte.
 * Len:  lvalue that receives the total length of the sequence in bytes
 *       (1..6), or -1 when Char is not a valid lead byte (for example a
 *       stray continuation byte, 0xfe or 0xff).
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement and is safe inside unbraced if/else bodies.  Invoke it
 * with a trailing semicolon.
 */
#define UTF8_COMPUTE(Char, Mask, Len) \
  do \
    { \
      if ((Char) < 128) \
        { \
          (Len) = 1; \
          (Mask) = 0x7f; \
        } \
      else if (((Char) & 0xe0) == 0xc0) \
        { \
          (Len) = 2; \
          (Mask) = 0x1f; \
        } \
      else if (((Char) & 0xf0) == 0xe0) \
        { \
          (Len) = 3; \
          (Mask) = 0x0f; \
        } \
      else if (((Char) & 0xf8) == 0xf0) \
        { \
          (Len) = 4; \
          (Mask) = 0x07; \
        } \
      else if (((Char) & 0xfc) == 0xf8) \
        { \
          (Len) = 5; \
          (Mask) = 0x03; \
        } \
      else if (((Char) & 0xfe) == 0xfc) \
        { \
          (Len) = 6; \
          (Mask) = 0x01; \
        } \
      else \
        { \
          (Len) = -1; \
          (Mask) = 0; \
        } \
    } \
  while (0)

/*
 * UTF8_GET(Result, Chars, Count, Mask, Len) - decode one UTF-8 sequence.
 *
 * Result: integer lvalue that receives the decoded value, or -1 when one
 *         of the trailing bytes is not a continuation byte (10xxxxxx).
 * Chars:  indexable sequence of byte values starting at the lead byte.
 * Count:  integer lvalue used as the loop index (clobbered).
 * Mask:   payload mask for the lead byte, as produced by UTF8_COMPUTE.
 * Len:    total length of the sequence in bytes, as produced by
 *         UTF8_COMPUTE.
 *
 * Bytes are only read up to the first one that fails the continuation
 * check, so a NUL terminator inside a truncated sequence stops the scan.
 *
 * The body is wrapped in do { } while (0) so the assignment and the loop
 * form one statement; without it, "if (x) UTF8_GET(...);" would guard only
 * the initial assignment.  Invoke it with a trailing semicolon.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
  do \
    { \
      (Result) = (Chars)[0] & (Mask); \
      for ((Count) = 1; (Count) < (Len); ++(Count)) \
        { \
          if (((Chars)[(Count)] & 0xc0) != 0x80) \
            { \
              (Result) = -1; \
              break; \
            } \
          (Result) <<= 6; \
          (Result) |= ((Chars)[(Count)] & 0x3f); \
        } \
    } \
  while (0)

/*
 * Decode the UTF-8 sequence that starts at **ptr.
 *
 * ptr: in/out cursor into the input.  On success it is advanced to the
 *      LAST byte of the decoded sequence (not past it), so the caller
 *      still has to step one byte forward to reach the next character.
 *      On failure it is left untouched.
 * len: number of bytes available at *ptr.
 *
 * Returns the decoded value, (unichar) -1 when the sequence needs more
 * than len bytes, or (unichar) -2 when the lead byte or one of the
 * continuation bytes is invalid.
 */
unichar get_utf8_char(const unsigned char **ptr, int len)
{
	int i, result, mask, chrlen;

	/* With no bytes available even the lead byte must not be read. */
	if (len <= 0)
		return (unichar) -1;

	mask = 0;
	UTF8_COMPUTE(**ptr, mask, chrlen);
	if (chrlen == -1)
		return (unichar) -2;

	/* The lead byte is fine but the sequence runs past the input. */
	if (chrlen > len)
		return (unichar) -1;

	UTF8_GET(result, *ptr, i, mask, chrlen);
	if (result == -1)
		return (unichar) -2;

	/* Stop on the final byte of the sequence, not one past it. */
	*ptr += chrlen-1;
	return result;
}

/*
 * Count the characters in the NUL-terminated UTF-8 string str.
 *
 * Each character is decoded with get_utf8_char(); counting ends at the
 * terminating NUL or as soon as the decoder's result is not greater
 * than zero.
 *
 * NOTE(review): get_utf8_char() reports errors as (unichar) -1 and
 * (unichar) -2.  Whether the "> 0" test rejects those depends on unichar
 * being a signed type, which is not visible from this file -- confirm
 * against its definition.
 */
int strlen_utf8(const char *str)
{
	const unsigned char *cur;
	int count;

	count = 0;
	/* 6 is the longest sequence the decoder accepts; it stops early at
	   the terminating NUL because NUL is not a continuation byte. */
	for (cur = (const unsigned char *) str; *cur != '\0'; cur++) {
		if (get_utf8_char(&cur, 6) <= 0)
			break;
		count++;
	}
	return count;
}

/*
 * Encode the single character c as a UTF-8 byte sequence.
 *
 * c:      character value to encode.
 * outbuf: receives the encoded bytes; no NUL terminator is appended.
 *         May be a null pointer, in which case nothing is written and
 *         only the length is computed.
 *
 * Returns the number of bytes in the encoding (1..6).
 */
int utf16_char_to_utf8(unichar c, char *outbuf)
{
	/* Marker bits of the lead byte, indexed by sequence length. */
	static const int lead_marker[] = {
		0, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
	};
	int nbytes, pos;

	/* Shortest form whose payload bits can hold the value. */
	if (c < 0x80)
		nbytes = 1;
	else if (c < 0x800)
		nbytes = 2;
	else if (c < 0x10000)
		nbytes = 3;
	else if (c < 0x200000)
		nbytes = 4;
	else if (c < 0x4000000)
		nbytes = 5;
	else
		nbytes = 6;

	if (!outbuf)
		return nbytes;

	/* Continuation bytes are filled from the end, six payload bits
	   apiece; what remains of c goes into the lead byte. */
	pos = nbytes;
	while (--pos > 0) {
		outbuf[pos] = (c & 0x3f) | 0x80;
		c >>= 6;
	}
	outbuf[0] = c | lead_marker[nbytes];

	return nbytes;
}

/*
 * Convert the NUL-terminated UTF-8 string str into unichar values.
 *
 * str: input string.
 * out: destination array.  No size is passed in, so the caller has to
 *      provide room for one element per decoded character plus the
 *      terminator.
 *
 * Conversion stops at the end of the string or at the first invalid lead
 * or continuation byte; whatever was decoded up to that point is kept and
 * the output is always zero-terminated.
 */
void utf8_to_utf16(const char *str, unichar *out)
{
	const unsigned char *src;
	int idx, chr, lead_mask, seqlen;

	/* The cursor only advances after a sequence decoded successfully. */
	for (src = (const unsigned char *) str; *src != '\0'; src += seqlen) {
		lead_mask = 0;
		UTF8_COMPUTE(*src, lead_mask, seqlen);
		if (seqlen == -1)
			break;

		UTF8_GET(chr, src, idx, lead_mask, seqlen);
		if (chr == -1)
			break;

		*out++ = chr;
	}

	*out = '\0';
}

/*
 * Convert the zero-terminated unichar string str to UTF-8.
 *
 * str: input characters, terminated by a zero element.
 * out: destination buffer.  No size is passed in, so the caller has to
 *      provide room for the whole encoding plus the terminating NUL.
 *
 * Each character is encoded with utf16_char_to_utf8() and the result is
 * NUL-terminated.
 */
void utf16_to_utf8(const unichar *str, char *out)
{
	const unichar *src;

	/* The encoder returns how many bytes it wrote; skip past them. */
	for (src = str; *src != '\0'; src++)
		out += utf16_char_to_utf8(*src, out);

	*out = '\0';
}