#include "debug.h"
#include "integral_types.h"
#include "lang_script.h"
+#include "utf8acceptinterchange.h"
#include "utf8statetable.h"
#ifdef CLD2_DYNAMIC_MODE
extern const CLD2TableSummary kDistinctOcta_obj;
extern const short kAvgDeltaOctaScore[];
+// Returns the length in bytes of the prefix of src[0..byte_length) that is all
+// interchange-valid UTF-8, as scanned with utf8acceptinterchange_obj
+int SpanInterchangeValid(const char* src, int byte_length) {
+ int bytes_consumed;  // not initialized here; presumably always set by UTF8GenericScan -- TODO confirm
+ const UTF8ReplaceObj* st = &utf8acceptinterchange_obj;
+ StringPiece str(src, byte_length);  // built from the caller's pointer and length
+ UTF8GenericScan(st, str, &bytes_consumed);  // only the out-parameter is used; any return value is ignored
+ return bytes_consumed;
+}
+
#ifdef CLD2_DYNAMIC_MODE
// CLD2_DYNAMIC_MODE is defined:
// Data will be read from an mmap opened at runtime.
inline bool FlagShort(int flags) {return (flags & kCLDFlagShort) != 0;}
inline bool FlagHint(int flags) {return (flags & kCLDFlagHint) != 0;}
inline bool FlagUseWords(int flags) {return (flags & kCLDFlagUseWords) != 0;}
+inline bool FlagBestEffort(int flags) {  // true iff any kCLDFlagBestEffort bit is set in flags
+ return (flags & kCLDFlagBestEffort) != 0;
+}
// Defines Top40 packed languages
// This alternate form overwrites redundant words, thus avoiding corrupting the
-// backmap for generate a vector of original-text ranges.
+// backmap for generating a vector of original-text ranges.
int CheapRepWordsInplaceOverwrite(char* isrc, int src_len, int* hash, int* tbl) {
const uint8* src = reinterpret_cast<const uint8*>(isrc);
const uint8* srclimit = src + src_len;
}
// This alternate form overwrites redundant words, thus avoiding corrupting the
-// backmap for generate a vector of original-text ranges.
+// backmap for generating a vector of original-text ranges.
int CheapSqueezeInplaceOverwrite(char* isrc,
int src_len,
int ichunksize) {
const Language* language3,
const int* percent3,
Language* summary_lang, bool* is_reliable,
- bool FLAGS_cld2_html, bool FLAGS_cld2_quiet) {
+ bool FLAGS_cld2_html, bool FLAGS_cld2_quiet,
+ int flags) {
// Vector of active languages; changes if we delete some
int slot_count = 3;
int active_slot[3] = {0, 1, 2};
for (int i = 0; i < 3; ++i) {
if (language3[i] == TG_UNKNOWN_LANGUAGE) {
ignore_percent += percent3[i];
- // Move the rest up, levaing input vectors unchanged
+ // Move the rest up, leaving input vectors unchanged
for (int j=i+1; j < 3; ++j) {
active_slot[j - 1] = active_slot[j];
}
}
// If return percent is too small (too many languages), return UNKNOWN
- if ((return_percent < kGoodFirstMinPercent)) {
+ if ((return_percent < kGoodFirstMinPercent) && !FlagBestEffort(flags)) {
if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
fprintf(stderr, "{Unreli %s %d%% percent too small} ",
LanguageCode(*summary_lang), return_percent);
}
}
}
+}
-
-
-
-
+// Extend results to fully cover the [lo..hi) range: only the first and last chunks of vec are stretched
+void FinishResultVector(int lo, int hi, ResultChunkVector* vec) {
+ if (vec == NULL) {return;}  // no vector supplied: nothing to extend
+ if (vec->size() == 0) {return;}  // empty vector: no chunk is created, it stays empty
+ ResultChunk* rc = &(*vec)[0];  // first chunk
+ if (rc->offset > lo) {  // first chunk starts after lo
+ int diff = rc->offset - lo;
+ rc->offset -= diff;  // start now equals lo
+ rc->bytes += diff;  // grow by the same amount so the chunk's end does not move
+ }
+ ResultChunk* rc2 = &(*vec)[vec->size() - 1];  // last chunk (same element as rc when size is 1)
+ int rc2hi = rc2->offset + rc2->bytes;  // one past the last byte currently covered
+ if (rc2hi < hi) {  // last chunk ends before hi
+ int diff = hi - rc2hi;
+ rc2->bytes += diff;  // end now equals hi; chunks in between are not examined
+ }
}
-
// Results language3/percent3/text_bytes must be exactly three items
Language DetectLanguageSummaryV2(
const char* buffer,
// This is the real, non-recursive return
// Move bytes for unreliable langs to another lang or UNKNOWN
- RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet);
+ if (!FlagBestEffort(flags)) {
+ RemoveUnreliableLanguages(&doc_tote, FLAGS_cld2_html, FLAGS_cld2_quiet);
+ }
// Redo the result extraction after the removal above
doc_tote.Sort(3);
reliable_percent3, language3, percent3, normalized_score3,
text_bytes, is_reliable);
-
-
Language summary_lang;
CalcSummaryLang(&doc_tote, total_text_bytes,
reliable_percent3, language3, percent3,
&summary_lang, is_reliable,
- FLAGS_cld2_html, FLAGS_cld2_quiet);
+ FLAGS_cld2_html, FLAGS_cld2_quiet, flags);
if (FLAGS_cld2_html && !FLAGS_cld2_quiet) {
for (int i = 0; i < 3; ++i) {
fprintf(stderr, "<br>\n");
}
+ // Extend results to fully cover the input buffer
+ FinishResultVector(0, buffer_length, resultchunkvector);
+
return summary_lang;
}