/**
 * Frozen lookup table of the languages the system supports, keyed by a
 * three-letter language code and mapped to the English display name.
 *
 * The keys are also handed to franc as the `only` allow-list in
 * `detectLanguage`, so they need to match the codes franc emits.
 *
 * The trailing comment on each entry is presumably an approximate speaker
 * count in millions — TODO confirm the source of these figures.
 *
 * NOTE(review): some keys do not look like ISO 639-3 codes. `rom` is the
 * ISO 639-3 code for Romany (Romanian is `ron`), and `alb`, `gre`, `wel`,
 * `geo`, `arm` are ISO 639-2/B bibliographic codes (ISO 639-3 uses `sqi`,
 * `ell`, `cym`, `kat`, `hye`). Verify against the codes franc actually
 * supports; entries it does not recognise may never be detected.
 */
export const SUPPORTED_LANGUAGES = Object.freeze({
  // Major European languages
  eng: 'English', // 322M
  deu: 'German', // 121M
  fra: 'French', // 124M
  spa: 'Spanish', // 332M
  ita: 'Italian', // 63M
  nld: 'Dutch', // 21M
  pol: 'Polish', // 44M
  por: 'Portuguese', // 182M

  // Nordic languages
  dan: 'Danish', // 5M
  fin: 'Finnish', // 6M
  nor: 'Norwegian', // 5M (Bokmål and Nynorsk)
  swe: 'Swedish', // 9M
  isl: 'Icelandic', // 0.3M

  // Eastern European languages
  rus: 'Russian', // 288M
  ukr: 'Ukrainian', // 41M
  ces: 'Czech', // 12M
  slk: 'Slovak', // 6M
  hun: 'Hungarian', // 15M
  bul: 'Bulgarian', // 9M
  rom: 'Romanian', // 26M
  srp: 'Serbian', // 21M (Cyrillic and Latin)
  hrv: 'Croatian', // 21M
  mkd: 'Macedonian', // 3M
  slv: 'Slovenian', // 2M
  alb: 'Albanian', // 5M

  // Baltic languages
  lit: 'Lithuanian', // 4M
  lav: 'Latvian', // 2M
  est: 'Estonian', // 1M

  // Other European languages
  gre: 'Greek', // 12M
  tur: 'Turkish', // 59M
  heb: 'Hebrew', // 5M
  cat: 'Catalan', // 4M
  gla: 'Scottish Gaelic', // 0.1M
  wel: 'Welsh', // 0.7M
  gle: 'Irish', // 1.8M
  bos: 'Bosnian', // 21M (Cyrillic and Latin)
  bel: 'Belarusian', // 10M
  geo: 'Georgian', // 4M
  arm: 'Armenian', // 7M
} as const);

/**
 * Union of the three-letter language codes supported by the system,
 * derived from the keys of `SUPPORTED_LANGUAGES` (e.g. `'eng'`, `'ces'`).
 *
 * Note: these are three-letter codes, not two-letter ISO 639-1 codes.
 */
export type SupportedLanguageCode = keyof typeof SUPPORTED_LANGUAGES;

/**
 * Type guard that checks whether the given language code is one of the
 * keys of `SUPPORTED_LANGUAGES`.
 *
 * @param language - Candidate language code; `null`, `undefined` and
 *   non-string values are never supported.
 * @returns `true` only when `language` is a string that is an own key of
 *   `SUPPORTED_LANGUAGES`, narrowing it to `SupportedLanguageCode`.
 */
export function isSupportedLanguage(
  language: string | null | undefined,
): language is SupportedLanguageCode {
  // Use an own-property check rather than the `in` operator: `in` also
  // matches keys inherited from Object.prototype ('toString',
  // 'constructor', ...), which would make this type predicate lie.
  return (
    typeof language === 'string' &&
    Object.prototype.hasOwnProperty.call(SUPPORTED_LANGUAGES, language)
  );
}

/**
 * Resolves a language code to the display name stored in
 * `SUPPORTED_LANGUAGES`.
 *
 * @param languageCode - Candidate language code; may be `null` or
 *   `undefined`.
 * @returns The mapped language name (e.g. `'English'` for `'eng'`), or the
 *   literal `'czech'` when the code is missing or not supported. Note that
 *   the fallback is lowercase, unlike the capitalized names in the table.
 */
export function getSupportedLanguage(
  languageCode: string | null | undefined,
): string {
  // Own-property check instead of `in`, so inherited keys such as
  // 'toString' cannot leak a non-string prototype member to the caller.
  if (
    typeof languageCode === 'string' &&
    Object.prototype.hasOwnProperty.call(SUPPORTED_LANGUAGES, languageCode)
  ) {
    return SUPPORTED_LANGUAGES[languageCode as SupportedLanguageCode];
  }

  return 'czech';
}

/**
 * Detects the document language from a representative sample of its text.
 * The result is a three-letter language code (franc reports ISO 639-3
 * codes, not two-letter ISO 639-1 codes). Detection uses the franc library,
 * which is based on trigrams, restricted to the keys of
 * `SUPPORTED_LANGUAGES`.
 *
 * https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes
 *
 * @param content - Full document text to analyse.
 * @param options - Optional tuning:
 *   - `sampleSize`: characters taken from each of the start, middle and end
 *     of the document (default 1000).
 *   - `minConfidence`: the top match must score strictly above this value
 *     to be accepted (default 0.7).
 *   - `defaultLanguage`: code returned when detection fails or is not
 *     confident enough (default `'ces'`).
 * @returns The chosen language code and franc's confidence for its top
 *   match. When franc cannot determine a language at all, the default
 *   language is returned with a confidence of 0.
 */
export async function detectLanguage(
  content: string,
  options?: {
    sampleSize?: number;
    minConfidence?: number;
    defaultLanguage?: string;
  },
): Promise<{
  language: string;
  confidence: number;
}> {
  const { francAll } = await import('franc');
  const {
    sampleSize = 1000,
    minConfidence = 0.7,
    defaultLanguage = 'ces',
  } = options ?? {};

  // Get samples from different parts of the document
  const getTextSamples = (text: string): string => {
    // Short documents: the three windows would overlap (and the middle
    // window's negative start index would wrap around to the tail of the
    // text), so analyse the whole text once instead of duplicated slices.
    if (text.length <= sampleSize * 3) {
      return text;
    }

    const center = Math.floor(text.length / 2);
    const start = text.slice(0, sampleSize);
    const middle = text.slice(center - sampleSize / 2, center + sampleSize / 2);
    const end = text.slice(-sampleSize);

    // NOTE(review): franc appears to inspect only a bounded prefix of its
    // input (2048 characters, as far as I recall) — confirm; if so, part of
    // the end sample is ignored with the default sampleSize.
    return `${start}\n${middle}\n${end}`;
  };

  // Detect language; franc returns matches ordered best-first.
  const [topMatch] = francAll(getTextSamples(content), {
    minLength: 1,
    only: Object.keys(SUPPORTED_LANGUAGES),
  });

  // franc reports 'und' (undetermined) with a score of 1 when it cannot
  // decide. Treat that as a failed detection rather than a confident hit.
  if (topMatch === undefined || topMatch[0] === 'und') {
    return { language: defaultLanguage, confidence: 0 };
  }

  const [language, confidence] = topMatch;

  return {
    language: confidence > minConfidence ? language : defaultLanguage,
    confidence,
  };
}
