Wednesday, August 7, 2024

Unicode Codepoint to UTF-8 Hex - The Bash Way

#######################################
# Print the low-order bits of an integer as a string of 0s and 1s.
# Arguments: $1 - non-negative integer
#            $2 - number of bits to print (most significant first)
# Outputs:   the bit string followed by a newline on stdout
#######################################
function _cp2hex_bits() {
  local -i number=$1 width=$2 pos
  local bits=''
  for ((pos = width - 1; pos >= 0; pos--)); do
    bits+=$(( (number >> pos) & 1 ))
  done
  printf '%s\n' "$bits"
}

#######################################
# Show how a Unicode codepoint is encoded in UTF-8.
# Arguments: $1 - codepoint in hex, optionally prefixed with "U", "u",
#                 "U+" or "u+" (e.g. U0024, 20AC, U+1F604); case-insensitive
# Outputs:   five lines on stdout: the byte count with the rendered
#            character, the codepoint, the codepoint in binary (padded to
#            the payload width of its UTF-8 form: 8/11/16/21 bits), the
#            UTF-8 bytes in binary, and the UTF-8 bytes in hex.
#            Diagnostics go to stderr.
# Returns:   0 on success, 2 if the argument is not a hex codepoint,
#            1 if it is above U+10FFFF.
# Notes:     Pure Bash (needs 4.2+ for \U); no bc/sed required.
#            Surrogates (D800-DFFF) are encoded mechanically even though
#            they are not valid Unicode scalar values.
#######################################
function cp2hex() {
  local codepoint="${1-}"
  # Explicit digit list instead of ranges so the match is locale-independent.
  local -r hex_re='^0*([0123456789ABCDEF]{1,8})$'
  # UTF-8 lead-byte marker bits, indexed by encoded length in bytes.
  local -ra lead_marker=(0 0x00 0xC0 0xE0 0xF0)
  local -i value bytes width utf8=0 rest shift=0 k
  local glyph_hex

  case "$codepoint" in
    [Uu]+*) codepoint="${codepoint:2}" ;;
    [Uu]*) codepoint="${codepoint:1}" ;;
  esac
  codepoint="${codepoint^^}"

  # Validate before the value reaches any arithmetic context: unvalidated
  # text inside (( )) is evaluated as an expression.
  if [[ ! "$codepoint" =~ $hex_re ]]; then
    printf 'cp2hex: invalid codepoint: %s\n' "${1-}" >&2
    return 2
  fi
  value=$((16#${BASH_REMATCH[1]}))

  if ((value > 0x10FFFF)); then
    printf 'cp2hex: codepoint out of range (max U+10FFFF): U%s\n' \
      "$codepoint" >&2
    return 1
  fi

  # Encoded length and the number of payload bits that length can carry.
  if ((value <= 0x7F)); then
    bytes=1 width=8
  elif ((value <= 0x7FF)); then
    bytes=2 width=11
  elif ((value <= 0xFFFF)); then
    bytes=3 width=16
  else
    bytes=4 width=21
  fi

  # Build the encoding from the last byte backwards: every continuation
  # byte is 10xxxxxx carrying the next 6 low bits of the codepoint.
  rest=$((value))
  for ((k = 1; k < bytes; k++)); do
    utf8=$((utf8 | (0x80 | (rest & 0x3F)) << shift))
    rest=$((rest >> 6))
    shift=$((shift + 8))
  done
  # Whatever bits remain go into the lead byte under its length marker.
  utf8=$((utf8 | (lead_marker[bytes] | rest) << shift))

  # \U consumes at most 8 hex digits, so render from the normalised value
  # rather than the raw argument (which may carry extra leading zeros).
  printf -v glyph_hex '%08X' "$value"
  printf '%d Byte unicode: %b\n' "$bytes" "\\U${glyph_hex}"
  printf 'Codepoint: U%s\n' "$codepoint"
  printf 'Codepoint binary: %s\n' "$(_cp2hex_bits "$value" "$width")"
  printf 'UTF-8 binary: %s\n' "$(_cp2hex_bits "$utf8" "$((bytes * 8))")"
  printf 'UTF-8 hex: %X\n' "$utf8"
}
$>cp2hex U0024
1 Byte unicode: $
Codepoint: U0024
Codepoint binary: 00100100
UTF-8 binary: 00100100
UTF-8 hex: 24

$>cp2hex U00A2
2 Byte unicode: ¢
Codepoint: U00A2
Codepoint binary: 00010100010
UTF-8 binary: 1100001010100010
UTF-8 hex: C2A2

$>cp2hex 20AC
3 Byte unicode: €
Codepoint: U20AC
Codepoint binary: 0010000010101100
UTF-8 binary: 111000101000001010101100
UTF-8 hex: E282AC

$>cp2hex U1F604
4 Byte unicode: 😄
Codepoint: U1F604
Codepoint binary: 000011111011000000100
UTF-8 binary: 11110000100111111001100010000100
UTF-8 hex: F09F9884

No comments:

Post a Comment