// Copyright 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef UI_ACCESSIBILITY_AX_LANGUAGE_DETECTION_H_
#define UI_ACCESSIBILITY_AX_LANGUAGE_DETECTION_H_

#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "base/macros.h"
#include "third_party/cld_3/src/src/nnet_language_identifier.h"
#include "ui/accessibility/ax_enums.mojom.h"
#include "ui/accessibility/ax_export.h"

namespace ui {

// Forward declarations. AXNode is only used through pointers and references
// in this header, so its full definition is not needed here.
// NOTE(review): AXTree is not referenced by any declaration in this header;
// confirm whether includers rely on this forward declaration before removing.
class AXNode;
class AXTree;

// This module implements language detection enabling Chrome to automatically
// detect the language for runs of text within the page.
//
// Node-level language detection runs once per page after the load complete
// event. This involves two passes:
//   *Detect* walks the tree from the given root using cld3 to detect up to 3
//            potential languages per node. A ranked list is created enumerating
//            all potential languages on a page.
//   *Label* re-walks the tree, assigning a language to each node considering
//           the potential languages from the detect phase, page-level
//           statistics, and the assigned languages of the ancestor nodes.
//
// Optionally an embedder may run *sub-node* language detection which attempts
// to assign languages for runs of text within a node, potentially down to the
// individual character level. This is useful in cases where a single paragraph
// involves switching between multiple languages, and where the speech engine
// doesn't automatically switch voices to handle different character sets.
// Due to the potentially small lengths of text runs involved this tends to be
// lower in accuracy, and works best when a node is composed of multiple
// languages with easily distinguishable scripts.

// AXLanguageInfo holds the local language detection data covering all of the
// text within a single AXNode. An instance is stored on the AXNode it
// describes.
struct AX_EXPORT AXLanguageInfo {
  AXLanguageInfo();
  ~AXLanguageInfo();

  // The final language assigned to this node during the 'label' step. It is
  // the result of merging:
  //  a) the detected language for this node,
  //  b) the declared lang attribute on this node, and
  //  c) the (recursive) language of the parent (detected or declared).
  //
  // This is the empty string if no language was assigned during the label
  // phase.
  //
  // The value is an IETF BCP 47 language code (RFC 5646), for example:
  //  'de'
  //  'de-DE'
  //  'en'
  //  'en-US'
  //  'es-ES'
  //
  // Clients of AXNode should not read this field directly; they should call
  // AXNode::GetLanguage() instead.
  std::string language;

  // Languages detected for this node, in the order returned by
  // FindTopNMostFreqLangs (decreasing probability), with any unreliable
  // results filtered out.
  std::vector<std::string> detected_languages;
};

// Each AXLanguageSpan contains a language, a probability, and start and end
// indices. The indices specify the substring that contains the associated
// language. The string which the indices are relative to is not included in
// this structure; the indices are relative to a UTF-8 string.
// See documentation on GetLanguageAnnotationForStringAttribute for details
// on how to associate this object with a string.
//
// The scalar members carry default member initializers so that a
// default-constructed span (e.g. `AXLanguageSpan span;`) never exposes
// indeterminate values. The struct remains an aggregate (C++14 and later), so
// brace-initialization with all four fields continues to work.
struct AX_EXPORT AXLanguageSpan {
  // Index of the start of the span within the associated UTF-8 string.
  int start_index = 0;
  // Index of the end of the span within the associated UTF-8 string.
  int end_index = 0;
  // Language detected for the span.
  std::string language;
  // Probability associated with the detected language.
  float probability = 0.0f;
};

// A single AXLanguageInfoStats instance is stored on each AXTree and contains
// statistics on detected languages for all the AXNodes in that tree.
//
// We rely on these tree-level statistics when labelling individual nodes, to
// provide extra signals to increase our confidence in assigning a detected
// language.
//
// The Label step will only assign a detected language to a node if that
// language is one of the most frequent languages on the page.
//
// For example, if a single node has detected_languages (in order of probability
// assigned by cld_3): da-DK, en-AU, fr-FR, but the page statistics overall
// indicate that the page is generally in en-AU and ja-JP, it is more likely to
// be a mis-recognition of Danish than an accurate assignment, so we assign
// en-AU instead of da-DK.
class AX_EXPORT AXLanguageInfoStats {
 public:
  AXLanguageInfoStats();
  ~AXLanguageInfoStats();

  // Adjust our statistics to add provided detected languages.
  void Add(const std::vector<std::string>& languages);

  // Fetch the score for a given language.
  int GetScore(const std::string& lang) const;

  // Check if a given language is within the top results.
  bool CheckLanguageWithinTop(const std::string& lang);

 private:
  // Store a count of the occurrences of a given language.
  std::unordered_map<std::string, unsigned int> lang_counts_;

  // Cache of last calculated top language results.
  // A vector of pairs of (score, language) sorted by descending score.
  std::vector<std::pair<unsigned int, std::string>> top_results_;
  // Boolean recording that we have not mutated the statistics since last
  // calculating top results, setting this to false will cause recalculation
  // when the results are next fetched.
  bool top_results_valid_;

  void InvalidateTopResults();

  void GenerateTopResults();

  DISALLOW_COPY_AND_ASSIGN(AXLanguageInfoStats);
};

// AXLanguageDetectionManager manages all of the context needed for language
// detection within an AXTree.
class AX_EXPORT AXLanguageDetectionManager {
 public:
  AXLanguageDetectionManager();
  ~AXLanguageDetectionManager();

  // Detect language for each node in the subtree rooted at the given node.
  // This is the first pass in detection and labelling.
  // This only detects the language, it does not label it, for that see
  //  LabelLanguageForSubtree.
  void DetectLanguageForSubtree(AXNode* subtree_root);

  // Label language for each node in the subtree rooted at the given node.
  // This is the second pass in detection and labelling.
  // This will label the language, but relies on the earlier detection phase
  // having already completed.
  void LabelLanguageForSubtree(AXNode* subtree_root);

  // Sub-node language detection for a given string attribute.
  // For example, if a node has name: "My name is Fred", then calling
  // GetLanguageAnnotationForStringAttribute(*node, ax::mojom::StringAttribute::
  // kName) would return language detection information about "My name is Fred".
  std::vector<AXLanguageSpan> GetLanguageAnnotationForStringAttribute(
      const AXNode& node,
      ax::mojom::StringAttribute attr);

 private:
  // TODO(chrishall): should this be stored by pointer or value?
  AXLanguageInfoStats lang_info_stats;

  void DetectLanguageForSubtreeInternal(AXNode* subtree_root);
  void LabelLanguageForSubtreeInternal(AXNode* subtree_root);

  // This language identifier is constructed with a default minimum byte length
  // of chrome_lang_id::NNetLanguageIdentifier::kMinNumBytesToConsider and is
  // used for detecting page-level languages.
  chrome_lang_id::NNetLanguageIdentifier language_identifier_;

  // This language identifier is constructed with a minimum byte length of
  // kShortTextIdentifierMinByteLength so it can be used for detecting languages
  // of shorter text (e.g. one character).
  chrome_lang_id::NNetLanguageIdentifier short_text_language_identifier_;

  DISALLOW_COPY_AND_ASSIGN(AXLanguageDetectionManager);
};

}  // namespace ui

#endif  // UI_ACCESSIBILITY_AX_LANGUAGE_DETECTION_H_
