How to decode HTML Entities in C?
I'm interested in unescaping text for example: \
maps to \
in C. Does anyone know of a good library?
As reference开发者_StackOverflow社区 the Wikipedia List of XML and HTML Character Entity References.
For another open source reference in C to decoding these HTML entities you can check out the command line utility uni2ascii/ascii2uni. The relevant files are enttbl.{c,h} for entity lookup and putu8.c which down converts from UTF32 to UTF8.
uni2ascii
I wrote my own unescape code; very simplified, but does the job: pn_util.c
Function Description: Convert special HTML entities back to characters. Need to do some modifications to fit your requirement.
char* HtmlSpecialChars_Decode(char* encodedHtmlSpecialEntities)
{
int encodedLen = 0;
int escapeArrayLen = 0;
static char decodedHtmlSpecialChars[TITLE_SIZE];
char innerHtmlSpecialEntities[MAX_CONFIG_ITEM_SIZE];
/* This mapping table can be extended if necessary. */
static const struct {
const char* encodedEntity;
const char decodedChar;
} entityToChars[] = {
{"<", '<'},
{">", '>'},
{"&", '&'},
{""", '"'},
{"'", '\''},
};
if(strchr(encodedHtmlSpecialEntities, '&') == NULL)
return encodedHtmlSpecialEntities;
memset(decodedHtmlSpecialChars, '\0', TITLE_SIZE);
memset(innerHtmlSpecialEntities, '\0', MAX_CONFIG_ITEM_SIZE);
escapeArrayLen = sizeof(entityToChars) / sizeof(entityToChars[0]);
strcpy(innerHtmlSpecialEntities, encodedHtmlSpecialEntities);
encodedLen = strlen(innerHtmlSpecialEntities);
for(int i = 0; i < encodedLen; i++)
{
if(innerHtmlSpecialEntities[i] == '&')
{
/* Potential encode char. */
char * tempEntities = innerHtmlSpecialEntities + i;
for(int j = 0; j < escapeArrayLen; j++)
{
if(strncmp(tempEntities, entityToChars[j].encodedEntity, strlen(entityToChars[j].encodedEntity)) == 0)
{
int index = 0;
strncat(decodedHtmlSpecialChars, innerHtmlSpecialEntities, i);
index = strlen(decodedHtmlSpecialChars);
decodedHtmlSpecialChars[index] = entityToChars[j].decodedChar;
if(strlen(tempEntities) > strlen(entityToChars[j].encodedEntity))
{
/* Not to the end, continue */
char temp[MAX_CONFIG_ITEM_SIZE] = {'\0'};
strcpy(temp, tempEntities + strlen(entityToChars[j].encodedEntity));
memset(innerHtmlSpecialEntities, '\0', MAX_CONFIG_ITEM_SIZE);
strcpy(innerHtmlSpecialEntities, temp);
encodedLen = strlen(innerHtmlSpecialEntities);
i = -1;
}
else
encodedLen = 0;
break;
}
}
}
}
if(encodedLen != 0)
strcat(decodedHtmlSpecialChars, innerHtmlSpecialEntities);
return decodedHtmlSpecialChars;
}
QString UNESC(const QString &txt) {
QStringList bld;
static QChar AMP = '&', SCL = ';';
static QMap<QString, QString> dec = {
{"<", "<"}, {">", ">"}
, {"&", "&"}, {""", R"(")"}, {"'", "'"} };
if(!txt.contains(AMP)) { return txt; }
int bgn = 0, pos = 0;
while((pos = txt.indexOf(AMP, pos)) != -1) {
int end = txt.indexOf(SCL, pos)+1;
QString val = dec[txt.mid(pos, end - pos)];
bld << txt.mid(bgn, pos - bgn);
if(val.isEmpty()) {
end = txt.indexOf(AMP, pos+1);
bld << txt.mid(pos, end - pos);
} else {
bld << val;
}// else // if(val.isEmpty())
bgn = end; pos = end;
}// while((pos = txt.indexOf(AMP, pos)) != -1)
return bld.join(QString());
}// UNESC
精彩评论