/**
 * Languages supported by the system, keyed by 2-letter ISO 639-1 code and
 * mapped to the English display name of the language.
 *
 * The object is frozen and declared `as const`, so its keys can be used as a
 * literal union type and the table cannot be modified at runtime.
 *
 * The trailing comment on each entry is presumably the approximate number of
 * speakers in millions.
 */
export const SUPPORTED_LANGUAGES = Object.freeze({
  // Major European languages
  en: 'English', // 322M
  de: 'German', // 121M
  fr: 'French', // 124M
  es: 'Spanish', // 332M
  it: 'Italian', // 63M
  nl: 'Dutch', // 21M
  pl: 'Polish', // 44M
  pt: 'Portuguese', // 182M

  // Nordic languages
  da: 'Danish', // 5M
  fi: 'Finnish', // 6M
  no: 'Norwegian', // 5M (Bokmål and Nynorsk)
  sv: 'Swedish', // 9M
  is: 'Icelandic', // 0.3M

  // Eastern European languages
  ru: 'Russian', // 288M
  uk: 'Ukrainian', // 41M
  cs: 'Czech', // 12M
  sk: 'Slovak', // 6M
  hu: 'Hungarian', // 15M
  bg: 'Bulgarian', // 9M
  ro: 'Romanian', // 26M
  sr: 'Serbian', // 21M (Cyrillic and Latin)
  hr: 'Croatian', // 21M
  mk: 'Macedonian', // 3M
  sl: 'Slovenian', // 2M
  sq: 'Albanian', // 5M

  // Baltic languages
  lt: 'Lithuanian', // 4M
  lv: 'Latvian', // 2M
  et: 'Estonian', // 1M

  // Other European languages
  el: 'Greek', // 12M
  tr: 'Turkish', // 59M
  he: 'Hebrew', // 5M
  ca: 'Catalan', // 4M
  gd: 'Scottish Gaelic', // 0.1M
  cy: 'Welsh', // 0.7M
  ga: 'Irish', // 1.8M
  bs: 'Bosnian', // 21M (Cyrillic and Latin)
  be: 'Belarusian', // 10M
  ka: 'Georgian', // 4M
  hy: 'Armenian', // 7M
} as const);

/**
 * Conversion map from 2-letter ISO 639-1 language codes to their 3-letter
 * ISO 639-3 equivalents.
 *
 * All values use the ISO 639-3 (terminological) form consistently, e.g.
 * `deu`/`fra`/`ces` rather than the bibliographic `ger`/`fre`/`cze`, and
 * likewise `sqi`, `ell`, `cym`, `kat`, `hye` rather than
 * `alb`, `gre`, `wel`, `geo`, `arm`.
 *
 * The object is frozen and declared `as const` so both keys and values are
 * available as literal types.
 */
export const ISO_639_1_LANGUAGE_CODES = Object.freeze({
  en: 'eng',
  de: 'deu',
  fr: 'fra',
  es: 'spa',
  it: 'ita',
  nl: 'nld',
  pl: 'pol',
  pt: 'por',
  da: 'dan',
  fi: 'fin',
  no: 'nor',
  sv: 'swe',
  is: 'isl',
  ru: 'rus',
  uk: 'ukr',
  cs: 'ces',
  sk: 'slk',
  hu: 'hun',
  bg: 'bul',
  ro: 'ron', // Romanian; `rom` would be Romany
  sr: 'srp',
  hr: 'hrv',
  mk: 'mkd',
  sl: 'slv',
  sq: 'sqi',
  lt: 'lit',
  lv: 'lav',
  et: 'est',
  el: 'ell',
  tr: 'tur',
  he: 'heb',
  ca: 'cat',
  gd: 'gla',
  cy: 'cym',
  ga: 'gle',
  bs: 'bos',
  be: 'bel',
  ka: 'kat',
  hy: 'hye',
} as const);

/**
 * Union of the 2-letter ISO 639-1 language codes supported by the system,
 * derived from the keys of `SUPPORTED_LANGUAGES`.
 */
export type SupportedLanguageCode = keyof typeof SUPPORTED_LANGUAGES;

/**
 * Union of the 3-letter language codes supported by the system, i.e. the
 * values (not the 2-letter keys) of `ISO_639_1_LANGUAGE_CODES`.
 */
export type SupportedLanguageCode3Letter =
  (typeof ISO_639_1_LANGUAGE_CODES)[keyof typeof ISO_639_1_LANGUAGE_CODES];

/**
 * Type guard that checks whether the given value is one of the 2-letter
 * language codes listed in `SUPPORTED_LANGUAGES`.
 *
 * @param language - Candidate language code; `null`, `undefined` and
 *   non-matching strings are rejected.
 * @returns `true` only when `language` is an own key of `SUPPORTED_LANGUAGES`.
 */
export function isSupportedLanguage(
  language: string | null | undefined,
): language is SupportedLanguageCode {
  // An own-property check is used instead of `in`, which would also accept
  // names inherited from Object.prototype such as 'toString' or 'constructor'.
  return (
    typeof language === 'string' &&
    Object.prototype.hasOwnProperty.call(SUPPORTED_LANGUAGES, language)
  );
}

/**
 * Returns the English display name of a supported language.
 *
 * @param languageCode - 2-letter ISO 639-1 language code to look up.
 * @returns The name stored in `SUPPORTED_LANGUAGES` for the code, or
 *   `'Czech'` as a fallback when the code is missing, empty or not supported.
 */
export function getSupportedLanguage(
  languageCode: string | null | undefined,
): string {
  // Only own keys count: with `in`, inherited names such as 'toString' would
  // pass the check and a prototype method would be returned instead of a name.
  const isKnownCode =
    typeof languageCode === 'string' &&
    Object.prototype.hasOwnProperty.call(SUPPORTED_LANGUAGES, languageCode);

  return isKnownCode
    ? SUPPORTED_LANGUAGES[languageCode as SupportedLanguageCode]
    : 'Czech';
}

/**
 * Maps the 3-letter (ISO 639-3) language codes used by franc back to the
 * 2-letter ISO 639-1 codes used by the rest of the system.
 *
 * NOTE(review): for some macrolanguages franc appears to report an
 * individual-language code instead (e.g. `nob`/`nno` for Norwegian), so both
 * forms are listed where that is suspected. Codes franc does not know are
 * simply never matched. Confirm the list against the installed franc version.
 */
const FRANC_CODE_TO_LANGUAGE: ReadonlyMap<string, SupportedLanguageCode> =
  new Map<string, SupportedLanguageCode>([
    ['eng', 'en'],
    ['deu', 'de'],
    ['fra', 'fr'],
    ['spa', 'es'],
    ['ita', 'it'],
    ['nld', 'nl'],
    ['pol', 'pl'],
    ['por', 'pt'],
    ['dan', 'da'],
    ['fin', 'fi'],
    ['nor', 'no'],
    ['nob', 'no'],
    ['nno', 'no'],
    ['swe', 'sv'],
    ['isl', 'is'],
    ['rus', 'ru'],
    ['ukr', 'uk'],
    ['ces', 'cs'],
    ['slk', 'sk'],
    ['hun', 'hu'],
    ['bul', 'bg'],
    ['ron', 'ro'],
    ['srp', 'sr'],
    ['hrv', 'hr'],
    ['mkd', 'mk'],
    ['slv', 'sl'],
    ['sqi', 'sq'],
    ['als', 'sq'],
    ['lit', 'lt'],
    ['lav', 'lv'],
    ['lvs', 'lv'],
    ['est', 'et'],
    ['ekk', 'et'],
    ['ell', 'el'],
    ['tur', 'tr'],
    ['heb', 'he'],
    ['cat', 'ca'],
    ['gla', 'gd'],
    ['cym', 'cy'],
    ['gle', 'ga'],
    ['bos', 'bs'],
    ['bel', 'be'],
    ['kat', 'ka'],
    ['hye', 'hy'],
  ]);

/**
 * Detects the document language from a representative sample and returns it
 * as a 2-letter ISO 639-1 code. The detection uses the franc library, which
 * is based on trigrams and identifies languages by 3-letter codes; those are
 * converted back to the 2-letter form before returning.
 *
 * https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes
 *
 * @param content - Full text of the document.
 * @param options - Optional tuning:
 *   - `sampleSize`: number of characters taken from the start, the middle and
 *     the end of the document (default 1000). Documents no longer than three
 *     samples are analysed as a whole.
 *   - `minConfidence`: score the best match must exceed to be accepted
 *     (default 0.7).
 *   - `defaultLanguage`: code returned when nothing acceptable is detected
 *     (default `'cs'`).
 * @returns The detected (or default) 2-letter language code together with the
 *   score reported by franc. When franc cannot determine any supported
 *   language, the default language is returned with a confidence of 0.
 */
export async function detectLanguage(
  content: string,
  options?: {
    sampleSize?: number;
    minConfidence?: number;
    defaultLanguage?: SupportedLanguageCode;
  },
): Promise<{
  language: string;
  confidence: number;
}> {
  const { francAll } = await import('franc');
  const {
    sampleSize = 1000,
    minConfidence = 0.7,
    defaultLanguage = 'cs',
  } = options ?? {};

  // Get samples from different parts of the document
  const getTextSamples = (text: string): string => {
    const size = Math.floor(sampleSize);

    // Short documents (or an unusable sample size) are analysed as a whole:
    // slicing them would only duplicate overlapping text, and a negative
    // slice start would be counted from the end of the string.
    if (!(size > 0) || text.length <= size * 3) {
      return text;
    }

    const middleStart = Math.floor((text.length - size) / 2);

    return [
      text.slice(0, size),
      text.slice(middleStart, middleStart + size),
      text.slice(-size),
    ].join('\n');
  };

  // Detect language. franc's `only` filter expects franc's own 3-letter
  // codes, not the 2-letter keys of SUPPORTED_LANGUAGES.
  const matches = francAll(getTextSamples(content), {
    minLength: 1,
    only: [...FRANC_CODE_TO_LANGUAGE.keys()],
  });
  const francCode = matches[0]?.[0] ?? 'und';
  const confidence = matches[0]?.[1] ?? 0;
  const detected = FRANC_CODE_TO_LANGUAGE.get(francCode);

  // 'und' (undetermined) or any code outside the supported set
  if (detected === undefined) {
    return { language: defaultLanguage, confidence: 0 };
  }

  // NOTE(review): franc seems to normalise scores relative to the best
  // candidate, so the top score may always be 1 - confirm that
  // `minConfidence` has the intended effect.
  return {
    language: confidence > minConfidence ? detected : defaultLanguage,
    confidence,
  };
}
