#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#include "ppport.h"

#include "GenerateFunctions.h"

/*
 * GF_escape_html - escape HTML special characters in a Perl string.
 *
 *   str          - scalar holding the text to escape
 *   b_inplace    - true: str is forced to a string, grown and rewritten in
 *                  place; false: a new SV is created and str is only read
 *   b_lftobr     - true: each "\n" is replaced by "<br>"
 *   b_sptonbsp   - true: the second of two adjacent spaces is replaced by
 *                  "&nbsp;", as is a string consisting of one single space
 *   b_leaveknown - true: an '&' that GF_is_known_entity() accepts is copied
 *                  through unchanged instead of becoming "&amp;"
 *
 * '<', '>' and '"' are always replaced by "&lt;", "&gt;" and "&quot;".
 *
 * Returns str itself when b_inplace is set, otherwise a newly created SV
 * that has the UTF-8 flag set if str had it. Croaks if the number of bytes
 * written does not match the number calculated beforehand.
 *
 * Lengths are tracked as bytes; the loop index is an int, as is the
 * interface of GF_is_known_entity().
 */
SV * GF_escape_html(SV * str, int b_inplace, int b_lftobr, int b_sptonbsp, int b_leaveknown) {
  int i, maxentitylen = 0;
  STRLEN origlen, extrachars;
  char * sp, *newsp, c, lastc;
  SV * newstr;

  /* Get string pointer and length (in bytes) */
  if (b_inplace) {
    sp = SvPV_force(str, origlen);
  } else {
    sp = SvPV(str, origlen);
  }

  /* Pass 1: calculate extra space required */
  extrachars = 0;
  c = '\0';
  for (i = 0; (STRLEN)i < origlen; i++) {
    /* Need to keep track of previous char for '  ' => ' &nbsp;' expansion */
    lastc = c;
    c = sp[i];
    if (c == '<' || c == '>') {
      extrachars += 3;                       /* "&lt;" / "&gt;" */
    } else if (c == '&' &&
               (!b_leaveknown ||
                !GF_is_known_entity(sp, i, (int)origlen, &maxentitylen))) {
      extrachars += 4;                       /* "&amp;" */
    } else if (c == '"') {
      extrachars += 5;                       /* "&quot;" */
    } else if (b_lftobr && c == '\n') {
      extrachars += 3;                       /* "<br>" */
    } else if (b_sptonbsp && c == ' ' && lastc == ' ') {
      extrachars += 5;                       /* "&nbsp;" */
      /* don't pick up immediately again */
      c = '\0';
    }
  }

  /* Special single space case */
  if (b_sptonbsp && origlen == 1 && sp[0] == ' ') {
    extrachars += 5;
  }

  /*
   * Include maxentitylen in extrachars. Since in the actual substitution
   * phase we work backwards, copying characters towards the end of the
   * string as we go, we might overwrite part of an entity and then call
   * GF_is_known_entity() on the string, which searches forward and would
   * fail because the entity was already overwritten. So we always make
   * sure we've got maxentitylen extra chars, and then use the perl OOK
   * hack (sv_chop) to offset the start of the string at the end.
   */
  if (b_inplace)
    extrachars += maxentitylen;

  /* Create new SV, or grow existing SV */
  if (b_inplace) {
    newstr = str;
    SvGROW(newstr, origlen + extrachars + 1);
  } else {
    newstr = newSV(origlen + extrachars + 1);
    SvPOK_on(newstr);
    /* Make new string UTF-8 if input string was UTF-8 */
    if (SvUTF8(str))
      SvUTF8_on(newstr);
  }

  /* Set the length of the string */
  SvCUR_set(newstr, origlen + extrachars);

  /* Pass 2: do the actual replacement (working backward so that the
   * in-place case never overwrites bytes it still has to read) */

  /* Original string might have moved due to grow */
  sp = SvPV_nolen(str);

  /* Null terminate new string */
  newsp = SvPV_nolen(newstr) + origlen + extrachars;
  *newsp = '\0';

  c = '\0';
  for (i = (int)(origlen - 1); i >= 0; i--) {
    lastc = c;
    c = sp[i];
    if (c == '<') {
      newsp -= 4;
      memcpy(newsp, "&lt;", 4);
    } else if (c == '>') {
      newsp -= 4;
      memcpy(newsp, "&gt;", 4);
    } else if (c == '&' &&
               (!b_leaveknown ||
                !GF_is_known_entity(sp, i, (int)origlen, 0))) {
      newsp -= 5;
      memcpy(newsp, "&amp;", 5);
    } else if (c == '"') {
      newsp -= 6;
      memcpy(newsp, "&quot;", 6);
    } else if (b_lftobr && c == '\n') {
      newsp -= 4;
      memcpy(newsp, "<br>", 4);
    } else if (b_sptonbsp && c == ' ' && lastc == ' ') {
      /* The 7 bytes deliberately overlap the space written on the previous
       * iteration: this space is kept and the one after it becomes
       * "&nbsp;", for a net growth of 5 bytes */
      newsp -= 6;
      memcpy(newsp, " &nbsp;", 7);
      /* don't pick up immediately again */
      c = '\0';
    } else {
      *--newsp = c;
    }
  }

  /* Special single space case: overwrite the space the loop just copied */
  if (b_sptonbsp && origlen == 1 && sp[0] == ' ') {
    newsp -= 5;
    memcpy(newsp, "&nbsp;", 6);
  }

  /* Drop the maxentitylen safety gap left at the front of the buffer */
  if (b_inplace && maxentitylen)
    sv_chop(newstr, newsp);

  if (SvPV_nolen(newstr) != newsp) {
    croak("Unexpected length mismatch");
    return 0; /* not reached */
  }

  return newstr;
}

/*
 * GF_generate_attributes - build an HTML attribute string from a hash.
 *
 * Each key/value pair of attrhv is appended as key="value", pairs being
 * separated by a single space. Keys shorter than 64 bytes are lower-cased
 * and have one leading '-' removed (eg -width => '10%'); longer keys are
 * appended unchanged. A value that is not defined produces the bare key.
 * Values are rendered by GF_generate_attribute_value().
 *
 * Returns a newly created SV holding the attribute string.
 */
SV * GF_generate_attributes(HV * attrhv) {
  int i, j, estimatedlen = 1;
  I32 keylen;
  char * key, tmp[64];
  SV * attrstr, * val;

  /* Iterate through keys to work out an estimated final length. The
   * iterator is reset first so the estimate does not depend on whatever
   * state a previous traversal of the hash left behind. */
  hv_iterinit(attrhv);
  while ((val = hv_iternextsv(attrhv, &key, &keylen))) {
    estimatedlen += keylen + 1;
    estimatedlen += GF_estimate_attribute_value_len(val) + 3;
  }

  attrstr = newSV(estimatedlen);
  SvPOK_on(attrstr);

  /* Now iterate and build actual string */
  hv_iterinit(attrhv);
  while ((val = hv_iternextsv(attrhv, &key, &keylen))) {

    /* Add space to string if already something in it */
    if (SvCUR(attrstr))
      sv_catpvn(attrstr, " ", 1);

    /* For key, convert to lower case and add to attrstr */
    if (keylen < 64) {
      /* If key starts with - (eg -width => '10%'), skip - */
      j = 0;
      i = (keylen && key[0] == '-' ? 1 : 0);
      for (; i < keylen; i++)
        tmp[j++] = toLOWER(key[i]);
      sv_catpvn(attrstr, tmp, j);

    } else {
      /* Too long for the scratch buffer: used as-is */
      sv_catpvn(attrstr, key, keylen);
    }

    /* Add '="value"' part if present */
    if (SvOK(val)) {
      sv_catpvn(attrstr, "=\"", 2);
      GF_generate_attribute_value(attrstr, val);
      sv_catpvn(attrstr, "\"", 1);
    }
  }

  return attrstr;
}

/*
 * GF_generate_tag - build "<tag attrs>value</tag>" as a new SV.
 *
 *   tag          - tag name
 *   attrhv       - attribute hash, or NULL for no attributes
 *   val          - element content, or NULL for no content and no end tag
 *   b_escapeval  - true: val is passed through GF_escape_html() first
 *   b_addnewline - true: "\n" is appended to the result
 *   b_closetag   - true: the start tag is written as "<tag ... />"
 *
 * Returns a newly created SV.
 */
SV * GF_generate_tag(SV * tag, HV * attrhv, SV * val, int b_escapeval, int b_addnewline, int b_closetag) {
  STRLEN taglen, vallen, estimatedlen;
  SV * tagstr, * attrstr = 0;

  /* Force tag to string when getting length */
  (void)SvPV(tag, taglen);
  estimatedlen = taglen + 3 + (b_addnewline ? 1 : 0);

  /* Create attributes as string */
  if (attrhv) {
    attrstr = GF_generate_attributes(attrhv);
    estimatedlen += SvCUR(attrstr) + 1;
  }

  if (val) {
    /* If asked to escape, escape the val (creates a temporary SV) */
    if (b_escapeval)
      val = GF_escape_html(val, 0, 0, 0, 0);

    /* Force value to string when getting length; the taglen + 3 accounts
     * for the "</tag>" end tag */
    (void)SvPV(val, vallen);
    estimatedlen += vallen + taglen + 3;
  }

  /* If asked to close the tag, add ' /' */
  if (b_closetag)
    estimatedlen += 2;

  /* Create new string to put final result in */
  tagstr = newSV(estimatedlen);
  SvPOK_on(tagstr);

  sv_catpvn(tagstr, "<", 1);
  sv_catsv(tagstr, tag);
  if (attrstr) {
    sv_catpvn(tagstr, " ", 1);
    sv_catsv(tagstr, attrstr);
    SvREFCNT_dec(attrstr);
  }
  if (b_closetag)
    sv_catpvn(tagstr, " />", 3);
  else
    sv_catpvn(tagstr, ">", 1);

  if (val) {
    sv_catsv(tagstr, val);
    /* Release the temporary created by GF_escape_html() above */
    if (b_escapeval)
      SvREFCNT_dec(val);
    sv_catpvn(tagstr, "</", 2);
    sv_catsv(tagstr, tag);
    sv_catpvn(tagstr, ">", 1);
  }
  if (b_addnewline)
    sv_catpvn(tagstr, "\n", 1);

  return tagstr;
}

/* Upper-case hex digits used for %XX escapes */
static const char * hexlookup = "0123456789ABCDEF";

/*
 * GF_escape_uri - percent-escape bytes of a Perl string.
 *
 *   str       - scalar holding the text to escape
 *   escchars  - scalar whose bytes form the set of additional characters
 *               that must be escaped
 *   b_inplace - true: str is forced to a string, grown and rewritten in
 *               place (its UTF-8 flag is turned off); false: a new SV is
 *               created and str is only read
 *
 * Every byte <= 0x20, every byte >= 0x80 and every byte found in escchars
 * is replaced by '%' followed by two upper-case hex digits.
 *
 * Returns str itself when b_inplace is set, otherwise a newly created SV.
 * Croaks if the number of bytes written does not match the number
 * calculated beforehand.
 */
SV * GF_escape_uri(SV * str, SV * escchars, int b_inplace) {
  int i;
  STRLEN origlen, esclen, extrachars;
  char * sp, *newsp, *escsp;
  unsigned char c;
  SV * newstr;

  /* Get string pointer and length (in bytes) */
  if (b_inplace) {
    sp = SvPV_force(str, origlen);
  } else {
    sp = SvPV(str, origlen);
  }

  escsp = SvPV(escchars, esclen);

  /* Pass 1: calculate extra space required */
  extrachars = 0;
  for (i = 0; (STRLEN)i < origlen; i++) {
    c = (unsigned char)sp[i];
    /* Always escape control and 8-bit chars, or chars in our escape set */
    if (c <= 0x20 || c >= 0x80 || memchr(escsp, c, esclen)) {
      extrachars += 2;
    }
  }

  /* Create new SV, or grow existing SV */
  if (b_inplace) {
    newstr = str;
    /* Always turn off utf8-ness in escaped string */
    SvUTF8_off(newstr);
    SvGROW(newstr, origlen + extrachars + 1);
  } else {
    newstr = newSV(origlen + extrachars + 1);
    SvPOK_on(newstr);
  }

  /* Set the length of the string */
  SvCUR_set(newstr, origlen + extrachars);

  /* Pass 2: do the actual replacement (working backward so that the
   * in-place case never overwrites bytes it still has to read) */

  /* Original string might have moved due to grow */
  sp = SvPV_nolen(str);

  /* Null terminate new string */
  newsp = SvPV_nolen(newstr) + origlen + extrachars;
  *newsp = '\0';

  for (i = (int)(origlen - 1); i >= 0; i--) {
    c = (unsigned char)sp[i];
    if (c <= 0x20 || c >= 0x80 || memchr(escsp, c, esclen)) {
      newsp -= 3;
      newsp[0] = '%';
      newsp[1] = hexlookup[(c >> 4) & 0x0f];
      newsp[2] = hexlookup[c & 0x0f];
    } else {
      *--newsp = (char)c;
    }
  }

  if (newsp != SvPV_nolen(newstr)) {
    croak("Unexpected length mismatch");
    return 0; /* not reached */
  }

  return newstr;
}

/*
 * GF_is_known_entity - test whether the text at sp[i] looks like an entity.
 *
 *   sp      - buffer being scanned
 *   i       - index of the character that starts the candidate; the
 *             callers in this file pass the index of an '&', the function
 *             itself does not look at sp[i]
 *   origlen - number of valid bytes in sp
 *   maxlen  - optional (may be NULL); when an entity is recognised and its
 *             length, counted from i up to and including the terminator,
 *             exceeds *maxlen, *maxlen is raised to that length
 *
 * Recognised forms are "#" + decimal digits, "#x"/"#X" + hex digits, and a
 * run of ASCII letters, each terminated by ';' or ' '.
 *
 * Returns 1 if an entity was recognised, 0 otherwise.
 */
int GF_is_known_entity(char * sp, int i, int origlen, int *maxlen) {
  int start = i;

  if (++i < origlen) {
    /* Check for unicode ref (eg &#1234;) */
    if (sp[i] == '#') {
      int is_hex = 0;

      /* Check for hex unicode ref (eg &#x12AF;) */
      if (i + 1 < origlen && (sp[i+1] == 'x' || sp[i+1] == 'X')) {
        is_hex = 1;
        i++;
      }

      /* Not quite right: a terminator directly after "&#" (eg "&#;") is
       * accepted even though no digits were seen */
      while (++i < origlen) {
        if (sp[i] >= '0' && sp[i] <= '9')
          continue;
        if (is_hex && ((sp[i] >= 'a' && sp[i] <= 'f') || (sp[i] >= 'A' && sp[i] <= 'F')))
          continue;
        if (sp[i] == ';' || sp[i] == ' ') {
          /* Keep track of maximum entity length */
          i++;
          if (maxlen && (i - start > *maxlen))
            *maxlen = i - start;
          return 1;
        }
        break;
      }

    /* Check for entity ref (eg &nbsp;) */
    } else if ((sp[i] >= 'a' && sp[i] <= 'z') || (sp[i] >= 'A' && sp[i] <= 'Z')) {
      while (++i < origlen) {
        if ((sp[i] >= 'a' && sp[i] <= 'z') || (sp[i] >= 'A' && sp[i] <= 'Z'))
          continue;
        /* We should check to see if matched text string is a known entity,
         * but it's not that important */
        if (sp[i] == ';' || sp[i] == ' ') {
          /* Keep track of maximum entity length */
          i++;
          if (maxlen && (i - start > *maxlen))
            *maxlen = i - start;
          return 1;
        }
        break;
      }
    }
  }

  return 0;
}

/*
 * GF_estimate_attribute_value_len - estimate the rendered length of an
 * attribute value, for pre-sizing the output buffer.
 *
 * A reference is followed once. For an array the estimates of all defined
 * elements are summed (plus one separator each); for a hash the key
 * lengths are summed (plus one separator each); for any other defined
 * value the string length in bytes is used. The estimate does not include
 * growth caused by HTML escaping. Undefined values yield 0.
 *
 * A non-string scalar is stringified by SvPV() to obtain its length.
 */
int GF_estimate_attribute_value_len(SV * val) {
  STRLEN vallen;
  I32 valtype;

  /* If reference, de-reference ... */
  if (SvROK(val)) {
    val = SvRV(val);
  }
  valtype = SvTYPE(val);

  /* Array case */
  if (valtype == SVt_PVAV) {
    int estimatedlen = 0;
    AV * aval = (AV *)val;
    I32 alen = av_len(aval), i;

    for (i = 0; i <= alen; i++) {
      SV **av_val;
      if ((av_val = av_fetch(aval, i, 0)) && SvOK(val = *av_val)) {
        estimatedlen += GF_estimate_attribute_value_len(val) + 1;
      }
    }
    return estimatedlen;
  }

  /* Hash case */
  if (valtype == SVt_PVHV) {
    int estimatedlen = 0;
    HV * hval = (HV *)val;
    char * key;
    I32 keylen;

    hv_iterinit(hval);
    while ((val = hv_iternextsv(hval, &key, &keylen))) {
      estimatedlen += keylen + 1;
    }
    return estimatedlen;
  }

  /* Ignore other non-scalar types */
  if (!SvOK(val))
    return 0;

  /* Most common case of a string */
  if (SvPOK(val))
    return (int)SvCUR(val);

  /* Other defined SV case, turn it into a string */
  (void)SvPV(val, vallen);
  return (int)vallen;
}

/*
 * GF_generate_attribute_value - append the rendering of val to attrstr.
 *
 * A reference is followed once. An array has its defined elements rendered
 * recursively, separated by spaces; a hash has its keys appended, separated
 * by spaces. Any other defined value is appended as text: HTML-escaped via
 * GF_escape_html() when val was passed directly, unescaped when val was
 * reached through a reference. Undefined values append nothing.
 */
void GF_generate_attribute_value(SV * attrstr, SV * val) {
  I32 valtype;
  int no_escape = 0;

  /* If reference, de-reference ... */
  if (SvROK(val)) {
    val = SvRV(val);
    no_escape = 1;
  }
  valtype = SvTYPE(val);

  /* Array? Iterate over array items space separated... */
  if (valtype == SVt_PVAV) {
    AV * aval = (AV *)val;
    I32 alen = av_len(aval), i;

    for (i = 0; i <= alen; i++) {
      SV **av_val;
      if ((av_val = av_fetch(aval, i, 0)) && SvOK(val = *av_val)) {
        GF_generate_attribute_value(attrstr, val);
        if (i != alen)
          sv_catpvn(attrstr, " ", 1);
      }
    }
    return;
  }

  /* Hash? Iterate over keys space separated... */
  if (valtype == SVt_PVHV) {
    HV * hval = (HV *)val;
    char * key;
    I32 keylen;
    I32 hlen = hv_iterinit(hval), i = 0;
    HE * hentry;

    while ((hentry = hv_iternext(hval))) {
      key = hv_iterkey(hentry, &keylen);
      sv_catpvn(attrstr, key, keylen);
      if (++i != hlen)
        sv_catpvn(attrstr, " ", 1);
    }
    return;
  }

  /* Ignore other non-scalar types */
  if (!SvOK(val))
    return;

  /* Otherwise just append to attribute string */

  if (no_escape) {
    /* If value was reference, use that unescaped */
    sv_catsv(attrstr, val);

  } else {
    /* Escape special html chars into a temporary, then dispose of it */
    val = GF_escape_html(val, 0, 0, 0, 0);
    sv_catsv(attrstr, val);
    SvREFCNT_dec(val);
  }

  return;
}