/** @prettier */
import { LiteralUnion } from 'type-fest';
import { max, isArray } from 'underscore';

/**
 * Builds a Unicode-aware regex that matches any single character whose
 * `Script_Extensions` property includes the given script. For example,
 * `createPattern('Cyrillic')` is equivalent to the literal
 * `/\p{Script_Extensions=Cyrillic}/u`.
 *
 * @param string - Unicode script name (e.g. `Arabic`, `Han`).
 * @returns A non-global regex with only the `u` flag set.
 * @throws SyntaxError if the name is not a script the regex engine knows.
 */
const createPattern = (string: string): RegExp => {
  const source = '\\p{Script_Extensions=' + string + '}';
  return new RegExp(source, 'u');
};

/**
 * The charsets we scan for, keyed by subset name. The keys need to map to
 * Google Fonts "subsets", for which there is currently no reference.
 *
 * Each entry has:
 * - `tests`: a single regex or an array of regexes built by `createPattern`
 *   (`detectCharset` accepts both forms and scores every regex separately).
 * - `isRtl` (optional): set to `true` on Arabic and Hebrew, presumably to
 *   mark right-to-left scripts; entries without it are treated as `false`
 *   by `detectCharset`.
 */
export const regexTests = {
  // Keep `latin` first: when every subset scores 0 the winner depends on key
  // order (presumably `max` keeps the first of equal scores — confirm).
  latin: {
    tests: createPattern('Latin'),
  },
  arabic: {
    tests: createPattern('Arabic'),
    isRtl: true,
  },
  cyrillic: {
    tests: createPattern('Cyrillic'),
  },
  hebrew: {
    tests: createPattern('Hebrew'),
    isRtl: true,
  },
  thai: {
    tests: createPattern('Thai'),
  },
  // Two patterns: each text segment is scored against both independently.
  japanese: {
    tests: [createPattern('Katakana'), createPattern('Hiragana')],
  },
  // NOTE(review): `Han` also covers characters used in Japanese text, so this
  // entry can compete with `japanese` — confirm that is intended.
  'chinese-simplified': {
    tests: [createPattern('Han')],
  },
  korean: {
    tests: [createPattern('Hangul')],
  },
  khmer: {
    tests: [createPattern('Khmer')],
  },
} as const;

/**
 * Name of a charset/subset: one of the keys of `regexTests`, or any other
 * string. `LiteralUnion` (from type-fest) keeps the known keys available for
 * editor completion while still accepting arbitrary strings.
 */
export type charsetName = LiteralUnion<keyof typeof regexTests, string>;
/** Result describing a charset/subset chosen for a piece of text. */
export interface CharsetInfo {
  /** Subset name (a key of `regexTests` when produced by this module). */
  subset: charsetName;
  /**
   * Heuristic score. `detectCharset` returns the computed score of the
   * winning subset, `charsetToCharsetInfo` always uses 1 and
   * `defaultCharsetInfo` uses 0.
   */
  score: number;
  /** Mirrors the `isRtl` flag of the matching `regexTests` entry. */
  isRtl: boolean;
}

/**
 * Builds a `CharsetInfo` for a known charset/subset name.
 *
 * @param charsetName - Subset name to look up in `regexTests`.
 * @returns A `CharsetInfo` with a fixed score of 1 and the entry's RTL flag,
 *   or `null` when the name is not a key of `regexTests`.
 */
export const charsetToCharsetInfo = (
  charsetName: charsetName,
): CharsetInfo | null => {
  // Own-property check: a plain truthiness lookup would also accept inherited
  // keys such as "constructor" or "toString" and return a bogus result.
  if (!Object.prototype.hasOwnProperty.call(regexTests, charsetName)) {
    return null;
  }

  const entry = regexTests[charsetName as keyof typeof regexTests];
  return {
    subset: charsetName,
    score: 1,
    // Entries without an `isRtl` flag must yield `false`, not `undefined`,
    // to honour `CharsetInfo.isRtl: boolean`.
    isRtl: 'isRtl' in entry && entry.isRtl,
  };
};

/** Maximum number of characters (code points) per segment. */
const splitEveryChars = 100;
/**
 * Splits text into segments of at most `splitEveryChars` characters when used
 * with `String.prototype.match`.
 *
 * The `u` flag makes `.` consume whole code points, so a segment boundary can
 * never cut a surrogate pair in half (which would leave two lone surrogates
 * that match no script pattern). `.` does not match line terminators, so
 * newlines are dropped and a segment never spans more than one line.
 */
const splitEvery = new RegExp(`.{1,${splitEveryChars}}`, 'gu');
/**
 * Fallback result: the `latin` subset with a score of 0 and no RTL flag.
 * `detectCharset` returns this same object (not a copy) for empty input.
 */
export const defaultCharsetInfo: CharsetInfo = {
  subset: 'latin',
  score: 0,
  isRtl: false,
};

/**
 * Guesses which subset of `regexTests` best fits a piece of text.
 *
 * The text is cut into segments with `splitEvery`; segments consisting only
 * of ASCII digits are ignored. Every subset is then scored against every
 * remaining segment, once per pattern of that subset:
 * - a segment that matches adds 1 (except for `latin`, which never gains),
 * - a segment that does not match subtracts 0.5.
 *
 * NOTE(review): subsets with several patterns (e.g. `japanese`) are scored
 * per pattern, so a segment matching only one of two patterns nets +0.5.
 *
 * @param stringToTest - Text to analyse; empty or missing text yields
 *   `defaultCharsetInfo`.
 * @returns The highest-scoring subset with its score and RTL flag.
 */
export function detectCharset(
  stringToTest: string | null | undefined,
): CharsetInfo {
  if (!stringToTest || stringToTest.length === 0) return defaultCharsetInfo;

  // Segmenting and digit filtering do not depend on the subset, so do both
  // once up front instead of once per subset.
  const digitsOnly = /^[0-9]+$/;
  const segments: string[] = stringToTest.match(splitEvery) || [];
  const scoredSegments = segments.filter((segment) => !digitsOnly.test(segment));

  const subsets = Object.keys(regexTests) as (keyof typeof regexTests)[];
  const collection = subsets.map((key): CharsetInfo => {
    const entry: { tests: RegExp | readonly RegExp[]; isRtl?: boolean } =
      regexTests[key];
    const tests = entry.tests;
    // `tests` is either a single regex or a list; the cast only restates what
    // the `isArray` check has already ruled out.
    const testsArray: readonly RegExp[] = isArray(tests)
      ? tests
      : [tests as RegExp];

    let score = 0;
    for (const test of testsArray) {
      for (const segment of scoredSegments) {
        // The patterns are non-global, so `test()` carries no state between
        // calls.
        if (test.test(segment)) {
          if (key !== 'latin') score++;
        } else {
          score -= 0.5;
        }
      }
    }

    return { subset: key, score, isRtl: !!entry.isRtl };
  });

  // `max` yields a number instead of an element for an empty collection;
  // fall back to the default rather than casting that away.
  const best = max(collection, (candidate) => candidate.score);
  return typeof best === 'number' ? defaultCharsetInfo : best;
}
