November 1, 2007

Convert utf-8 to ANSI (Windows-1252) and back in Visual C++ 6.0 (and 7.0, 8.0)

The Chilkat Charset library provides advanced features for converting text data between various character encodings, and it can be particularly sophisticated with respect to error handling. However, for simple conversions, it doesn’t make sense to add the bloat to your program. Here are two simple C++ functions for converting from a multi-byte code page to Unicode and back.
We’ll then show you how to call them to convert from utf-8 to ANSI (both of which are multi-byte charsets) and back. (utf-8 is a multi-byte encoding of Unicode.)

	
#include <windows.h>
	
// 65001 is utf-8.
// Converts a NUL-terminated multi-byte string in the given Windows code page
// (e.g. 1252 for Windows-1252, 65001 for utf-8) to a NUL-terminated
// wide-character (Unicode) string.
//
//   codePage  Windows code page identifier describing the bytes in src.
//   src       NUL-terminated input string; may be 0.
//
// Returns a buffer allocated with new[] that the caller must release with
// delete [].  Returns 0 if src is 0, if src is too long for its length to be
// passed as an int, or if MultiByteToWideChar reports failure.  An empty src
// yields an empty (one-element) string, not 0.
wchar_t *CodePageToUnicode(int codePage, const char *src)
    {
    if (!src) return 0;

    // MultiByteToWideChar takes the byte count as an int.  Reject any input
    // whose length does not survive the size_t -> int conversion rather than
    // handing the API a truncated or negative count.
    size_t srcBytes = strlen(src);
    int srcLen = (int)srcBytes;
    if (srcLen < 0 || (size_t)srcLen != srcBytes) return 0;

    // A zero length would make the API call fail, so handle "" directly.
    if (!srcLen)
        {
        wchar_t *w = new wchar_t[1];
        w[0] = 0;
        return w;
        }

    // First pass: output buffer of size 0 asks only for the required number
    // of wide characters.
    int requiredSize = MultiByteToWideChar(codePage,
        0,
        src,srcLen,0,0);

    if (!requiredSize)
        {
        return 0;
        }

    // srcLen excludes the terminator, so the API will not write one;
    // reserve one extra element and terminate the result here.
    wchar_t *w = new wchar_t[requiredSize+1];
    w[requiredSize] = 0;

    // Second pass: perform the actual conversion.
    int retval = MultiByteToWideChar(codePage,
        0,
        src,srcLen,w,requiredSize);
    if (!retval)
        {
        delete [] w;
        return 0;
        }

    return w;
    }
	
// Converts a NUL-terminated wide-character (Unicode) string to a
// NUL-terminated multi-byte string in the given Windows code page
// (e.g. 1252 for Windows-1252, 65001 for utf-8).
//
//   codePage  Windows code page identifier for the output bytes.
//   src       NUL-terminated wide-character input string; may be 0.
//
// Returns a buffer allocated with new[] that the caller must release with
// delete [].  Returns 0 if src is 0, if src is too long for its length to be
// passed as an int, or if WideCharToMultiByte reports failure.  An empty src
// yields an empty (one-element) string, not 0.
char *UnicodeToCodePage(int codePage, const wchar_t *src)
    {
    if (!src) return 0;

    // WideCharToMultiByte takes the character count as an int.  Reject any
    // input whose length does not survive the size_t -> int conversion rather
    // than handing the API a truncated or negative count.
    size_t srcChars = wcslen(src);
    int srcLen = (int)srcChars;
    if (srcLen < 0 || (size_t)srcLen != srcChars) return 0;

    // A zero length would make the API call fail, so handle L"" directly.
    if (!srcLen)
        {
        char *x = new char[1];
        x[0] = '\0';
        return x;
        }

    // First pass: output buffer of size 0 asks only for the required number
    // of bytes.  Flags and both default-character arguments are left as 0.
    int requiredSize = WideCharToMultiByte(codePage,
        0,
        src,srcLen,0,0,0,0);

    if (!requiredSize)
        {
        return 0;
        }

    // srcLen excludes the terminator, so the API will not write one;
    // reserve one extra byte and terminate the result here.
    char *x = new char[requiredSize+1];
    x[requiredSize] = 0;

    // Second pass: perform the actual conversion.
    int retval = WideCharToMultiByte(codePage,
        0,
        src,srcLen,x,requiredSize,0,0);
    if (!retval)
        {
        delete [] x;
        return 0;
        }

    return x;
    }

Now to use the functions:

	const char *text = "Sôn bôn de magnà el véder, el me fa minga mal.";
	
	// Convert ANSI (Windows-1252, i.e. CP1252) to utf-8:
	wchar_t *wText = CodePageToUnicode(1252,text);
	
	char *utf8Text = UnicodeToCodePage(65001,wText);
	
	FILE *fp = fopen("utf8File.txt","w");
	fprintf(fp,"%s\n",utf8Text);
	fclose(fp);
	
	// Now convert utf-8 back to ANSI:
	wchar_t *wText2 = CodePageToUnicode(65001,utf8Text);
	
	char *ansiText = UnicodeToCodePage(1252,wText2);
	
	fp = fopen("ansiFile.txt","w");
	fprintf(fp,"%s\n",ansiText);
	fclose(fp);
	
	delete [] ansiText;
	delete [] wText2;
	delete [] wText;
	delete [] utf8Text;

Privacy Statement. Copyright 2000-2011 Chilkat Software, Inc. All rights reserved.
Send feedback to support@chilkatsoft.com
Components for Microsoft Windows XP, 2000, 2003 Server, Vista, Windows 7, and Windows 95/98/NT4.