The London Perl and Raku Workshop takes place on 26th Oct 2024. If your company depends on Perl, please consider sponsoring and/or attending.
#include "ddl_enc.h"

#include "ppport.h"

#define PERL_NO_GET_CONTEXT

#include "ptable.h"
#include "ddl_buffer.h"

/* General 'config' constants */
#define INITIALIZATION_SIZE 16384
#define MAX_DEPTH 10000

/* three extra for rounding, sign, and end of string */
#define IVUV_MAXCHARS (sizeof (UV) * CHAR_BIT * 28 / 93 + 3)

#define F_UNDEF_BLESSED                 1UL
#define F_DISALLOW_MULTI_OCCURRENCE     2UL
#define F_DUMP_OBJECTS_AS_UNBLESSED     4UL
#define F_DUMP_OBJECTS_AS_BLESSED       8UL

/* some static function declarations */
static void ddl_dump_rv(pTHX_ ddl_encoder_t *enc, SV *src);
static void ddl_dump_av(pTHX_ ddl_encoder_t *enc, AV *src);
static void ddl_dump_hv(pTHX_ ddl_encoder_t *enc, HV *src);
static void ddl_dump_hk(pTHX_ ddl_encoder_t *enc, HE *src);
static void ddl_dump_pv(pTHX_ ddl_encoder_t *enc, const char* src, STRLEN src_len, int is_utf8);

void ddl_destructor_hook(void *p)
{
  ddl_encoder_t *enc = (ddl_encoder_t *)p;
  /* Exception cleanup. Under normal operation, we should have
   * assigned NULL to buf_start after we're done. */
  Safefree(enc->buf_start);
  PTABLE_free(enc->seenhash);
  Safefree(enc);
}

/* Builds the C-level configuration and state struct.
 * Automatically freed at scope boundary. */
ddl_encoder_t *
build_encoder_struct(pTHX_ HV *opt)
{
  ddl_encoder_t *enc;
  SV **svp;

  Newx(enc, 1, ddl_encoder_t);
  /* Register our structure for destruction on scope exit */
  SAVEDESTRUCTOR(&ddl_destructor_hook, (void *)enc);

  /* Init struct */
  Newx(enc->buf_start, INITIALIZATION_SIZE, char);
  enc->buf_end = enc->buf_start + INITIALIZATION_SIZE;
  enc->pos = enc->buf_start;
  enc->depth = 0;
  enc->flags = 0;

  /* TODO: We could do this lazily: Only if there's references with high refcount/weakrefs */
  enc->seenhash = PTABLE_new();

  /* load options */
  if (opt != NULL) {
    if ( (svp = hv_fetchs(opt, "undef_blessed", 0)) && SvTRUE(*svp))
      enc->flags |= F_UNDEF_BLESSED;
    if ( (svp = hv_fetchs(opt, "disallow_multi", 0)) && SvTRUE(*svp))
      enc->flags |= F_DISALLOW_MULTI_OCCURRENCE;
    if ( (svp = hv_fetchs(opt, "objects_as_unblessed", 0)) && SvTRUE(*svp))
      enc->flags |= F_DUMP_OBJECTS_AS_UNBLESSED;
    if ( (svp = hv_fetchs(opt, "dump_objects", 0)) && SvTRUE(*svp))
      enc->flags |= F_DUMP_OBJECTS_AS_BLESSED;
  }
  /* option vlaidation */
  /* FIXME my bit field fu is weak, apparently. Needs replacing with proper idiom */
  if (   (enc->flags & F_UNDEF_BLESSED ? 1 : 0)
       + (enc->flags & F_DUMP_OBJECTS_AS_UNBLESSED ? 1 : 0)
       + (enc->flags & F_DUMP_OBJECTS_AS_BLESSED ? 1 : 0)
       > 1)
  {
    croak("Can only have one of 'undef_blessed', "
          "'objects_as_unblessed', and 'dump_objects' options at a time.");
  }

  return enc;
}



/* Entry point for serialization. Dumps generic SVs and delegates
 * to more specialized functions for RVs, etc. */
void
ddl_dump_sv(pTHX_ ddl_encoder_t *enc, SV *src)
{
  SvGETMAGIC(src);

  /* dump strings */
  if (SvPOKp(src)) {
    STRLEN len;
    char *str = SvPV(src, len);
    BUF_SIZE_ASSERT(enc, 2 + len);
    ddl_dump_pv(aTHX_ enc, str, len, SvUTF8(src));
  }
  /* dump floats */
  else if (SvNOKp(src)) {
    BUF_SIZE_ASSERT(enc, NV_DIG + 32);
    Gconvert(SvNVX(src), NV_DIG, 0, enc->pos);
    enc->pos += strlen(enc->pos);
  }
  /* dump ints */
  else if (SvIOKp(src)) {
    /* we assume we can always read an IV as a UV and vice versa
     * we assume two's complement
     * we assume no aliasing issues in the union */
    if (SvIsUV(src) ? SvUVX(src) <= 59000
                    : SvIVX(src) <= 59000 && SvIVX(src) >= -59000)
    {
      /* optimise the "small number case"
       * code will likely be branchless and use only a single multiplication
       * works for numbers up to 59074 */
      I32 i = SvIVX(src);
      U32 u;
      char digit, nz = 0;

      BUF_SIZE_ASSERT(enc, 6);

      *enc->pos = '-'; enc->pos += i < 0 ? 1 : 0;
      u = i < 0 ? -i : i;

      /* convert to 4.28 fixed-point representation */
      u *= ((0xfffffff + 10000) / 10000); /* 10**5, 5 fractional digits */

      /* now output digit by digit, each time masking out the integer part
       * and multiplying by 5 while moving the decimal point one to the right,
       * resulting in a net multiplication by 10.
       * we always write the digit to memory but conditionally increment
       * the pointer, to enable the use of conditional move instructions. */
      digit = u >> 28; *enc->pos = digit + '0'; enc->pos += (nz = nz || digit); u = (u & 0xfffffffUL) * 5;
      digit = u >> 27; *enc->pos = digit + '0'; enc->pos += (nz = nz || digit); u = (u & 0x7ffffffUL) * 5;
      digit = u >> 26; *enc->pos = digit + '0'; enc->pos += (nz = nz || digit); u = (u & 0x3ffffffUL) * 5;
      digit = u >> 25; *enc->pos = digit + '0'; enc->pos += (nz = nz || digit); u = (u & 0x1ffffffUL) * 5;
      digit = u >> 24; *enc->pos = digit + '0'; enc->pos += 1; /* correctly generate '0' */
    }
    else {
      /* large integer, use the (rather slow) snprintf way. */
      BUF_SIZE_ASSERT(enc, IVUV_MAXCHARS);
      enc->pos +=
         SvIsUV(src)
            ? snprintf(enc->pos, IVUV_MAXCHARS, "%"UVuf, (UV)SvUVX(src))
            : snprintf(enc->pos, IVUV_MAXCHARS, "%"IVdf, (IV)SvIVX(src));
    }
  } /* end is an integer */
  /* undef */
  else if (!SvOK(src)) {
    ddl_buf_cat_str_s(enc, "undef");
  }
  /* dump references */
  else if (SvROK(src))
    ddl_dump_rv(aTHX_ enc, SvRV(src));
  else {
    croak("Attempting to dump unsupported or invalid SV");
  }
}


/* Dump references, delegates to more specialized functions for
 * arrays, hashes, etc. */
static void
ddl_dump_rv(pTHX_ ddl_encoder_t *enc, SV *src)
{
  svtype svt;
  int blessed_object = 0;

  if (++enc->depth > MAX_DEPTH) {
    croak("Reached maximum recursion depth of %u. Aborting", MAX_DEPTH);
  }

  SvGETMAGIC(src);
  svt = SvTYPE(src);

  /* Have to check the seen hash if high refcount or a weak ref */
  if (SvREFCNT(src) > 1 || SvWEAKREF(src)) {
    /* FIXME is the actual sv location the right thing to use? */
    PTABLE_ENTRY_t *entry = PTABLE_find(enc->seenhash, src);
    if (entry != NULL)
      croak("Encountered reference multiple times: '%s'",
            SvPV_nolen(sv_2mortal(newRV_inc(src))));
    else
      PTABLE_store(enc->seenhash, src, NULL);
  }

  if (SvOBJECT(src) && (0 == (enc->flags & F_DUMP_OBJECTS_AS_UNBLESSED))) {
    if (enc->flags & F_UNDEF_BLESSED) {
      ddl_buf_cat_str_s(enc, "undef");
      goto done;
    }
    else if (enc->flags & F_DUMP_OBJECTS_AS_BLESSED) {
      ddl_buf_cat_str_s(enc, "bless(");
      blessed_object = 1;
    }
    else
      croak("Encountered object '%s', but undef_blessed setting is not enabled",
            SvPV_nolen(sv_2mortal(newRV_inc(src))));
  }

  if (svt == SVt_PVHV)
    ddl_dump_hv(aTHX_ enc, (HV *)src);
  else if (svt == SVt_PVAV)
    ddl_dump_av(aTHX_ enc, (AV *)src);
  else if (svt < SVt_PVAV) {
    ddl_buf_cat_char(enc, '\\');
    ddl_dump_sv(aTHX_ enc, src);
  }
  /* else if (enc->json.flags & F_ALLOW_UNKNOWN)
   *  ddl_dump_pv(aTHX_ enc, "null", 4, 0);
   */
  else {
    croak("found %s, but it is not representable by Data::Dumper::Limited serialization",
           SvPV_nolen(sv_2mortal(newRV_inc(src))));
  }

done:
  /* finish writing the bless(XXX,"classname") call */
  if (blessed_object) {
    /* FIXME this should probably do ' escaping! */
    const char *class_name = HvNAME(SvSTASH(src));
    const size_t len = strlen(class_name);
    BUF_SIZE_ASSERT(enc, len + 4);
    ddl_buf_cat_str_s_nocheck(enc, ",");
    ddl_dump_pv(aTHX_ enc,class_name,len,0);
    ddl_buf_cat_str_s_nocheck(enc, ")");
  }

  /* If we DO allow multiple occurrence of the same ref (default), then
   * we need to drop its seenhash entry as soon as it cannot be a cyclic
   * ref any more. */
  if (!(enc->flags & F_DISALLOW_MULTI_OCCURRENCE)) {
    PTABLE_delete(enc->seenhash, src);
  }
}


static void
ddl_dump_av(pTHX_ ddl_encoder_t *enc, AV *src)
{
  UV i, n;
  SV **svp;

  n = av_len(src)+1;
  if (n == 0) {
    ddl_buf_cat_str_s(enc, "[]");
    return;
  }

  ddl_buf_cat_char(enc, '[');

  svp = av_fetch(src, 0, 0);
  if (svp == NULL)
    croak("Got NULL SV from av_fetch");
  ddl_dump_sv(aTHX_ enc, *svp);

  for (i = 1; i < n; ++i) {
    ddl_buf_cat_char(enc, ',');
    svp = av_fetch(src, i, 0);
    if (svp != NULL)
      ddl_dump_sv(aTHX_ enc, *svp);
    else
      ddl_buf_cat_str_s(enc, "undef");
  }

  ddl_buf_cat_char(enc, ']');
}


static void
ddl_dump_hv(pTHX_ ddl_encoder_t *enc, HV *src)
{
  HE *he;
  ddl_buf_cat_char(enc, '{');
  if (hv_iterinit(src) || SvMAGICAL(src)) {
    if ((he = hv_iternext(src))) {
      for (;;) {
        ddl_dump_hk(aTHX_ enc, he);
        ddl_buf_cat_char(enc, ','); /* see comments in ddl_dump_hk */
        ddl_dump_sv(aTHX_ enc, SvMAGICAL(src) ? hv_iterval(src, he) : HeVAL(he));

        if (!(he = hv_iternext(src)))
          break;

        ddl_buf_cat_char(enc, ',');
      }
    }
  }
  ddl_buf_cat_char(enc, '}');
}


static void
ddl_dump_hk(pTHX_ ddl_encoder_t *enc, HE *src)
{
  /* FIXME we could scan the string to see whether we could
   *       skip quoting the string and instead using a fat comma.
   *       But that's a lot of extra coding work, potentially slow,
   *       and a small gain.
   *       Even if that's not done, we can always use the fat comma
   *       for readability. Maybe make that configurable later? */
  if (HeKLEN(src) == HEf_SVKEY) {
    SV *sv = HeSVKEY(src);
    STRLEN len;
    char *str;

    SvGETMAGIC(sv);
    str = SvPV(sv, len);

    ddl_dump_pv(aTHX_ enc, str, len, SvUTF8(sv));
  }
  else {
    ddl_dump_pv(aTHX_ enc, HeKEY(src), HeKLEN(src), HeKUTF8(src));
  }
}

static void
ddl_dump_pv(pTHX_ ddl_encoder_t *enc, const char* src, STRLEN src_len, int is_utf8)
{
    const U8 *scan= (U8*)src;
    const U8 *scan_end= (U8*)src + src_len;
    const U8 *plain_start= 0;
    const U8 *plain_end= 0;
    UV cp;
    STRLEN ulen;
    int has_escapes= 0;
    STRLEN quote_ofs= BUF_POS_OFS(enc);
#define CLEAR_PLAIN_START(enc, plain_start, plain_end)                                          \
    STMT_START {                                                                                \
        if (plain_start) {                                                                      \
            ddl_buf_cat_str(enc, (const char *)plain_start, plain_end - plain_start);     \
            plain_start= plain_end= 0;                                                          \
        }                                                                                       \
    } STMT_END

    BUF_SIZE_ASSERT(enc,src_len);

    ddl_buf_cat_char(enc,'"');
    while (scan < scan_end) {
        cp= *scan;
        switch ((U8)cp) {
        case 0:   /* 0 */
            cp= '0';
            goto simple_esc;
        case '\a': /* 7 */
            cp= 'a';
            goto simple_esc;
        case '\b': /* 8 */
            cp= 'b';
            goto simple_esc;
        case '\t': /* 9 */
            cp= 't';
            goto simple_esc;
        case '\n': /* 10 */
            cp= 'n';
            goto simple_esc;
        case '\f': /* 12 */
            cp= 'f';
            goto simple_esc;
        case '\r': /* 13 */
            cp= 'r';
            goto simple_esc;
        case 27:
            cp= 'e';
            goto simple_esc;
            /* fallthrough */
        case '"':
            /* fallthrough */
        case '\\':
            /* fallthrough */
        case '$':
            /* fallthrough */
        case '@':
            /* fallthrough */
            /* handle simple escapes */
        simple_esc:
            CLEAR_PLAIN_START(enc,plain_start,plain_end);
            BUF_SIZE_ASSERT(enc,2);         /* max size of a special escape including null*/
            *enc->pos++= '\\';
            *enc->pos++= cp;
            scan++;
            has_escapes= 1;
            break;
        case 1:
        case 2:
        case 3:
        case 4:
        case 5:
        case 6:
        case 11:
        case 14:
        case 15:
        case 16:
        case 17:
        case 18:
        case 19:
        case 20:
        case 21:
        case 22:
        case 23:
        case 24:
        case 25:
        case 26:
        case 28:
        case 29:
        case 30:
        case 31:
            goto octal;
            break; /* not reached */
        case ' ':
        case '!':
        case '#':
        case '%':
        case '&':
        case '\'':
        case '(':
        case ')':
        case '*':
        case '+':
        case ',':
        case '-':
        case '.':
        case '/':
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
        case ':':
        case ';':
        case '<':
        case '=':
        case '>':
        case '?':
        case 'A':
        case 'B':
        case 'C':
        case 'D':
        case 'E':
        case 'F':
        case 'G':
        case 'H':
        case 'I':
        case 'J':
        case 'K':
        case 'L':
        case 'M':
        case 'N':
        case 'O':
        case 'P':
        case 'Q':
        case 'R':
        case 'S':
        case 'T':
        case 'U':
        case 'V':
        case 'W':
        case 'X':
        case 'Y':
        case 'Z':
        case '[':
        case ']':
        case '^':
        case '_':
        case '`':
        case 'a':
        case 'b':
        case 'c':
        case 'd':
        case 'e':
        case 'f':
        case 'g':
        case 'h':
        case 'i':
        case 'j':
        case 'k':
        case 'l':
        case 'm':
        case 'n':
        case 'o':
        case 'p':
        case 'q':
        case 'r':
        case 's':
        case 't':
        case 'u':
        case 'v':
        case 'w':
        case 'x':
        case 'y':
        case 'z':
        case '{':
        case '|':
        case '}':
        case '~':
        case 127:
            if (!plain_start)
                plain_start= scan;
            plain_end= ++scan;
            break;
        default:
            if ( is_utf8 ) {
                has_escapes= 1;
                CLEAR_PLAIN_START(enc,plain_start,plain_end);
                // cp=  Perl_utf8_to_uvchr_buf(aTHX_ scan, scan_end, &ulen);
                cp= Perl_utf8_to_uvchr(aTHX_ (U8 *)scan, &ulen);
                scan += ulen;
                BUF_SIZE_ASSERT(enc,21); /* max size of a hex value of an escape (assume \x{FEDCBA9876543210} is possible) including null*/
                ulen= sprintf(enc->pos,"\\x{%"UVxf"}",cp); /* no need for snprintf here IMO, if the the size assert is right */
                enc->pos += ulen;
            } else {
              octal:
                has_escapes= 1;
                CLEAR_PLAIN_START(enc,plain_start,plain_end);
                scan++;
                BUF_SIZE_ASSERT(enc,5); /* max size of an octal value (\001) including null*/
                if (scan >= scan_end || *scan < '0' || *scan > '7') {
                    ulen= sprintf(enc->pos,"\\%"UVof,cp);
                    enc->pos += ulen;
                } else {
                    ulen= sprintf(enc->pos,"\\%03"UVof,cp);
                    enc->pos += ulen;
                }
            }
        }
    }
    CLEAR_PLAIN_START(enc,plain_start,plain_end);
    if (has_escapes) {
        ddl_buf_cat_char(enc,'"');
    } else {
        ddl_buf_cat_char(enc,'\'');
        enc->buf_start[quote_ofs]='\'';
    }
}