Wednesday, August 28, 2024

Small Fast URL Encoder - vx_urlenc

// SPDX-License-Identifier: Apache-2.0
/***********************************************************
 **********************************************************
 
 * Fast URL Encoder - Closely conforms to RFC 3986
 *
 * Takes piped-in URIs, similar to jq, but is quicker and conforms
 * more closely to RFC 3986 [diverges from most browsers on #'s].
 *
 * Handles multibyte unicode characters 
 *  
 * Copyright (C) 2024 Victrixsoft
 *
 * Issues: If you find any issues please open a ticket on github!
 *         https://github.com/victrixsoft/vx_urlenc/issues 
 * 
 * Author: Adam Danischewski <my first nm(dot)my last nm@gmail.com>
 * 
 **********************************************************
 ***********************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef _WIN32
#include <io.h>
#define access _access
#else
#include <unistd.h>
#endif

/* Number of entries in lookup_table: one per byte value 0..127. */
#define ASCII_SIZE 128

/* Forward declarations for the functions defined further down. */
char *url_encode(const char *);
void init_lookup_table(void);
/* NOTE(review): fileno is declared by hand here, presumably so the file
 * still compiles when <stdio.h> does not expose it — confirm. */
int fileno(FILE *);
/* constructor attribute (GCC/Clang extension): init is run before main. */
void __attribute__((constructor)) init(void);

/* Pass-through table indexed by byte value. A nonzero entry marks a byte
 * that url_encode copies to the output unescaped; it starts out all zero
 * and is filled in by init_lookup_table. */
char lookup_table[ASCII_SIZE] = {0};

/*
 * Populate lookup_table with the set of bytes that are emitted verbatim.
 *
 * Marks the letters a-z and A-Z, the digits 0-9, and the punctuation
 * characters - _ . : / ! ~ * @ $ & [ ] + = , ; ? by storing 1 in the
 * corresponding table entry. Entries for all other bytes are left untouched.
 */
void init_lookup_table(void) {
  static const char ranges[][2] = {{'a', 'z'}, {'A', 'Z'}, {'0', '9'}};
  static const char punctuation[] = "-_.:/!~*@$&[]+=,;?";

  /* Alphanumeric ranges, inclusive at both ends. */
  for (size_t r = 0; r < sizeof ranges / sizeof ranges[0]; r++) {
    for (int ch = ranges[r][0]; ch <= ranges[r][1]; ch++) {
      lookup_table[ch] = 1;
    }
  }

  /* Individual punctuation characters that pass through unescaped. */
  for (size_t k = 0; punctuation[k] != '\0'; k++) {
    lookup_table[(unsigned char)punctuation[k]] = 1;
  }
}

/*
 * Start-up hook: the constructor attribute makes this run before main(),
 * so the lookup table is filled in before any encoding takes place.
 */
void __attribute__((constructor)) init(void) {
  init_lookup_table();
}

/*
 * Percent-encode one line of text.
 *
 * Reads bytes from str up to (not including) the first newline or the
 * terminating NUL. A byte whose lookup_table entry is nonzero is copied
 * unchanged; every other byte -- including all bytes >= 0x80, i.e. the
 * bytes of multibyte UTF-8 sequences -- is written as "%XX" with
 * uppercase hex digits.
 *
 * Returns a newly allocated NUL-terminated string that the caller must
 * free(), or NULL if str is NULL, the required size would overflow
 * size_t, or allocation fails.
 */
char *url_encode(const char *str) {
  static const char hex_digits[] = "0123456789ABCDEF";

  if (str == NULL)
    return NULL;

  /* strlen() already counts bytes, so the worst case is three output
   * bytes ("%XX") per input byte, plus the terminator. */
  size_t in_len = strlen(str);
  if (in_len > ((size_t)-1 - 1) / 3)
    return NULL;

  char *encoded = malloc(in_len * 3 + 1);
  if (encoded == NULL)
    return NULL;

  char *penc = encoded;
  for (; *str && *str != '\n'; str++) { // Stop at newline
    unsigned char c = (unsigned char)*str;
    /* The table only has entries for 7-bit values; bytes outside it must
     * never be used as an index and are always escaped. */
    if (c < sizeof lookup_table && lookup_table[c]) {
      *penc++ = (char)c;
    } else {
      *penc++ = '%';
      *penc++ = hex_digits[c >> 4];
      *penc++ = hex_digits[c & 0x0F];
    }
  }
  *penc = '\0';

  /* Shrink to the exact size; if that fails the original buffer is still
   * valid and correctly terminated, so hand it back as is. */
  char *resized_enc = realloc(encoded, (size_t)(penc - encoded) + 1);
  return resized_enc != NULL ? resized_enc : encoded;
}

/*
 * Entry point: read lines from standard input, percent-encode each one
 * with url_encode(), and write one encoded line per input line to
 * standard output.
 *
 * Refuses to run (exit status 1) when stdin is a terminal rather than a
 * pipe or file. Command-line arguments are ignored. Returns 0 otherwise.
 */
int main(int argc, char *argv[]) {
  (void)argc;
  (void)argv;

  if (isatty(fileno(stdin))) {
    /* Not an errno-style failure, so report it with fprintf instead of
     * perror (which would append an unrelated strerror text). */
    fprintf(stderr, "It's not a pipe\n");
    return 1;
  }

  char strbuf[65536];
  int line_open = 0; /* stdout holds output not yet ended by '\n' */
  while (fgets(strbuf, sizeof strbuf, stdin) != NULL) {
    /* fgets returns a chunk without a trailing newline when the input
     * line is longer than the buffer (or at EOF). Such chunks belong to
     * the same logical line, so the newline is only written once the
     * line is really complete. */
    size_t len = strlen(strbuf);
    int ends_line = len > 0 && strbuf[len - 1] == '\n';

    char *encoded = url_encode(strbuf);
    if (encoded) {
      fputs(encoded, stdout);
      free(encoded);
      line_open = 1;
    } else {
      fprintf(stderr, "Error encoding line\n");
    }

    if (ends_line && line_open) {
      putchar('\n');
      line_open = 0;
    }
  }

  /* Final line had no trailing newline in the input: terminate it. */
  if (line_open)
    putchar('\n');

  return 0;
}
 Compile: gcc -o vx_urlenc vx_urlenc.c
   Usage: echo 'http://www.gaggle.com/🐀🐁🐂🐃🐄🐅🐆🐇🐈🐉🐊🐋🐌🐍!!![]*?#@.mp4' | ./vx_urlenc
 
https://github.com/victrixsoft/vx_urlenc

Wednesday, August 7, 2024

Unicode Codepoint to Hex - The Bash Way

# cp2hex CODEPOINT
#
# Show how a Unicode code point is encoded in UTF-8.
# CODEPOINT is 1-6 hex digits, optionally prefixed with "U" or "U+"
# (case-insensitive), e.g. U0024, 20AC, u+1f604.
#
# Prints five lines: the byte count with the rendered character, the
# normalized code point, the code point bits zero-padded to the payload
# width of the encoding (8/11/16/21), the UTF-8 bit pattern, and the UTF-8
# bytes in hex (two digits per byte).
# Returns 1 with a message on stderr for missing/invalid input or a value
# above U+10FFFF.
function cp2hex() { 
 local codepoint="${1^^}"
 codepoint="${codepoint#U}"
 codepoint="${codepoint#+}"
 if [[ ! $codepoint =~ ^[0-9A-F]{1,6}$ ]] || ((16#${codepoint} > 0x10FFFF)); then
  echo "cp2hex: expected a hex code point up to 10FFFF, e.g. U20AC" >&2
  return 1
 fi

 local -i value=$((16#${codepoint}))
 local -i bytes=0 width=0 i
 # width = number of payload bits carried by an encoding of that length
 if ((value <= 0x007F)); then
  bytes=1 width=8
 elif ((value <= 0x07FF)); then
  bytes=2 width=11
 elif ((value <= 0xFFFF)); then
  bytes=3 width=16
 else
  bytes=4 width=21
 fi

 # Build the zero-padded bit string one bit at a time. Treating the bit
 # string as a decimal number for printf "%0Nd" overflows 64-bit integers
 # for code points of 20 bits and more.
 local binary=""
 for ((i = width - 1; i >= 0; i--)); do
  binary+=$(((value >> i) & 1))
 done

 # Slice the payload bits into the leading byte and 6-bit continuation
 # bytes, adding the UTF-8 marker bits in front of each group.
 local utf8
 case $bytes in
  1) utf8="${binary}" ;;
  2) utf8="110${binary:0:5}10${binary:5:6}" ;;
  3) utf8="1110${binary:0:4}10${binary:4:6}10${binary:10:6}" ;;
  4) utf8="11110${binary:0:3}10${binary:3:6}10${binary:9:6}10${binary:15:6}" ;;
 esac

 # codepoint is validated hex at this point, so it is safe in the format.
 printf "${bytes} Byte unicode: \U${codepoint}\n"
 echo "Codepoint: U${codepoint}"
 echo "Codepoint binary: ${binary}"
 echo "UTF-8 binary: ${utf8}"
 echo "UTF-8 hex: $(printf '%0*X' $((bytes * 2)) $((2#${utf8})))"
}
$>cp2hex U0024
1 Byte unicode: $
Codepoint: U0024
Codepoint binary: 00100100
UTF-8 binary: 00100100
UTF-8 hex: 24

$>cp2hex U00A2
2 Byte unicode: ¢
Codepoint: U00A2
Codepoint binary: 00010100010
UTF-8 binary: 1100001010100010
UTF-8 hex: C2A2

$>cp2hex 20AC
3 Byte unicode: €
Codepoint: U20AC
Codepoint binary: 0010000010101100
UTF-8 binary: 111000101000001010101100
UTF-8 hex: E282AC

$>cp2hex U1F604
4 Byte unicode: 😄
Codepoint: U1F604
Codepoint binary: 000011111011000000100
UTF-8 binary: 11110000100111111001100010000100
UTF-8 hex: F09F9884