From e208465b3f8e6cffb9dfca62142f73c6ba166957 Mon Sep 17 00:00:00 2001 From: Victor Cebollada Date: Fri, 12 Feb 2016 08:45:08 +0000 Subject: [PATCH 1/1] TextModel - Create the text segmentation info for a given range of characters inside a text. Change-Id: Ia00c9fa34aab4a7c8aedd1168dcc41740f30311a Signed-off-by: Victor Cebollada --- .../utc-Dali-Text-CharacterSetConversion.cpp | 62 +++++ .../utc-Dali-Text-Segmentation.cpp | 276 +++++++++++++++++---- dali-toolkit/internal/text/segmentation.cpp | 44 +++- dali-toolkit/internal/text/segmentation.h | 4 + .../internal/text/text-controller-impl.cpp | 4 + 5 files changed, 331 insertions(+), 59 deletions(-) diff --git a/automated-tests/src/dali-toolkit-internal/utc-Dali-Text-CharacterSetConversion.cpp b/automated-tests/src/dali-toolkit-internal/utc-Dali-Text-CharacterSetConversion.cpp index fd77e0d..2ec5233 100644 --- a/automated-tests/src/dali-toolkit-internal/utc-Dali-Text-CharacterSetConversion.cpp +++ b/automated-tests/src/dali-toolkit-internal/utc-Dali-Text-CharacterSetConversion.cpp @@ -30,6 +30,7 @@ using namespace Text; // Tests the following functions for scripts with different number of bytes per character. // Latin 1 byte per character, Arabic 2 bytes per character, Devanagari 3 bytes per character and emojis 4 bytes per character. 
// +// uint8_t GetUtf8Length( uint8_t utf8LeadByte ); // uint32_t GetNumberOfUtf8Characters( const uint8_t* const utf8, uint32_t length ); // uint32_t GetNumberOfUtf8Bytes( const uint32_t* const utf32, uint32_t numberOfCharacters ); // uint32_t Utf8ToUtf32( const uint8_t* const utf8, uint32_t length, uint32_t* utf32 ); @@ -122,6 +123,67 @@ bool Utf32ToUtf8Test( const Utf32ToUtf8Data& data ) ////////////////////////////////////////////////////////// +int UtcDaliTextCharacterSetConversionGetUtf8Length(void) +{ + ToolkitTestApplication application; + tet_infoline(" UtcDaliTextCharacterSetConversionGetUtf8Length"); + + // Copy of the table used to get the size in bytes of a character encoded with utf8. + // If the table used by the GetUtf8Length() function is updated, this one needs to be updated as well. + const static uint8_t U1 = 1u; + const static uint8_t U2 = 2u; + const static uint8_t U3 = 3u; + const static uint8_t U4 = 4u; + const static uint8_t U0 = 0u; + const static uint8_t UTF8_LENGTH[256] = { + U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // + U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // + U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // + U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // + U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // + U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // + U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // + U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // + U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // lead byte = 0xxx xxxx (U+0000 - U+007F + some extended ascii characters) + U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // + U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // + U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // + U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // + U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // + U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // + U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // + U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // + U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // + U1, U1, U1, U1, U1, U1, U1, U1, U1, U1, // + U1, U1, // + + U2, U2, U2, U2, U2, U2, U2, U2, U2, 
U2, // + U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, // lead byte = 110x xxxx (U+0080 - U+07FF) + U2, U2, U2, U2, U2, U2, U2, U2, U2, U2, // + U2, U2, // + + U3, U3, U3, U3, U3, U3, U3, U3, U3, U3, // lead byte = 1110 xxxx (U+0800 - U+FFFF) + U3, U3, U3, U3, U3, U3, // + + U4, U4, U4, U4, U4, U4, U4, U4, // lead byte = 1111 0xxx (U+10000 - U+1FFFFF) + + U0, U0, U0, U0, // Non valid. + U0, U0, U0, U0, // Non valid. + }; + + for( unsigned int index = 0; index < 256u; ++index ) + { + if( GetUtf8Length( index ) != UTF8_LENGTH[static_cast(index)] ) + { + tet_result(TET_FAIL); + } + } + + tet_result(TET_PASS); + END_TEST; +} + + int UtcDaliTextCharacterSetConversionGetNumberOfUtf8Characters(void) { ToolkitTestApplication application; diff --git a/automated-tests/src/dali-toolkit-internal/utc-Dali-Text-Segmentation.cpp b/automated-tests/src/dali-toolkit-internal/utc-Dali-Text-Segmentation.cpp index 2e60ee5..0023240 100644 --- a/automated-tests/src/dali-toolkit-internal/utc-Dali-Text-Segmentation.cpp +++ b/automated-tests/src/dali-toolkit-internal/utc-Dali-Text-Segmentation.cpp @@ -29,8 +29,12 @@ using namespace Toolkit; using namespace Text; // Tests the following functions with different scripts. -// void SetLineBreakInfo( const Vector& text, Vector& lineBreakInfo ); -// void SetWordBreakInfo( const Vector& text, Vector& wordBreakInfo ); +// void SetLineBreakInfo( const Vector& text, +// Vector& lineBreakInfo ); +// void SetWordBreakInfo( const Vector& text, +// CharacterIndex startIndex, +// Length numberOfCharacters, +// Vector& wordBreakInfo ); ////////////////////////////////////////////////////////// @@ -39,9 +43,11 @@ namespace struct BreakInfoData { - std::string description; ///< Description of the test. - std::string text; ///< input text. - std::string breakInfo; ///< The expected break info. + std::string description; ///< Description of the test. + std::string text; ///< input text. + uint32_t index; ///< The index from where to start to query the break info. 
+ uint32_t numberOfCharacters; ///< The requested number of characters. + std::string breakInfo; ///< The expected break info. }; bool LineBreakInfoTest( const BreakInfoData& data ) @@ -53,6 +59,7 @@ bool LineBreakInfoTest( const BreakInfoData& data ) const uint32_t numberOfCharacters = Utf8ToUtf32( reinterpret_cast( data.text.c_str() ), data.text.size(), &utf32[0u] ); + utf32.Resize( numberOfCharacters ); // 2) Set the line break info. @@ -69,7 +76,14 @@ bool LineBreakInfoTest( const BreakInfoData& data ) breakInfo << static_cast( lineBreakInfo[index] ); } - return data.breakInfo == breakInfo.str(); + if( data.breakInfo != breakInfo.str() ) + { + std::cout << " expected : [" << data.breakInfo << "]" << std::endl; + std::cout << " got : [" << breakInfo.str() << "]" << std::endl; + return false; + } + + return true; } bool WordBreakInfoTest( const BreakInfoData& data ) @@ -81,15 +95,34 @@ bool WordBreakInfoTest( const BreakInfoData& data ) const uint32_t numberOfCharacters = Utf8ToUtf32( reinterpret_cast( data.text.c_str() ), data.text.size(), &utf32[0u] ); + utf32.Resize( numberOfCharacters ); - // 2) Set the word break info. + // 2) Set the word break info for the whole text. Vector wordBreakInfo; wordBreakInfo.Resize( numberOfCharacters ); - SetWordBreakInfo( utf32, wordBreakInfo ); + SetWordBreakInfo( utf32, + 0u, + numberOfCharacters, + wordBreakInfo ); - // 3) compare the results + // 3) Update the word break info if it's requested for part of the text. + if( ( 0u != data.index ) && + ( numberOfCharacters != data.numberOfCharacters ) ) + { + // Clear part of the word break info. + wordBreakInfo.Erase( wordBreakInfo.Begin() + data.index, + wordBreakInfo.Begin() + data.index + data.numberOfCharacters ); + + // Update the word break info. 
+ SetWordBreakInfo( utf32, + data.index, + data.numberOfCharacters, + wordBreakInfo ); + } + + // 4) compare the results std::ostringstream breakInfo; for( unsigned int index = 0u; index < numberOfCharacters; ++index ) @@ -97,7 +130,17 @@ bool WordBreakInfoTest( const BreakInfoData& data ) breakInfo << static_cast( wordBreakInfo[index] ); } - return data.breakInfo == breakInfo.str(); + if( data.breakInfo != breakInfo.str() ) + { + std::cout << " text : [" << data.text << "]" << std::endl; + std::cout << " index : " << data.index << std::endl; + std::cout << " numberOfCharacters : " << data.numberOfCharacters << std::endl; + std::cout << " expected : [" << data.breakInfo << "]" << std::endl; + std::cout << " got : [" << breakInfo.str() << "]" << std::endl; + return false; + } + + return true; } } // namespace @@ -114,40 +157,49 @@ int UtcDaliTextSegnemtationSetLineBreakInfo(void) { "Zero characters", "", + 0u, + 0u, "", }, { "Latin script", - "Hello world", - "22222122220", - }, - { - "Latin script with \n", - "Hello\nworld", - "22222022220", + "Lorem ipsum dolor sit amet, aeque definiebas ea mei, posse iracundia ne cum.\n" + "Usu ne nisl maiorum iudicabit, veniam epicurei oporteat eos an.\n" + "Ne nec nulla regione albucius, mea doctus delenit ad!\n" + "Et everti blandit adversarium mei, eam porro neglegentur suscipiantur an.\n" + "Quidam corpora at duo. 
An eos possim scripserit?", + 0u, + 317u, + "22222122222122222122212222212222212222222222122122221222221222222222122122220" + "2221221222212222222122222222221222222122222222122222222122212220" + "221222122222122222221222222222122212222221222222212220" + "22122222212222222122222222222122221222122222122222222222122222222222212220" + "222222122222221221222212212221222222122222222220", }, { "Japanese script", - "こんにちは世界", - "1111110", - }, - { - "Japanese script with \n", - "こんにちは\n世界", - "11112010", + "韓国側は北朝鮮当局を通じて米ドルで賃金を支払う。\n" + "国際社会から様々な経済制裁を受ける北朝鮮にとっては出稼ぎ労働などと並んで重要な外貨稼ぎの手段となっている。\n" + "韓国統一省によると15年だけで1320億ウォン(約130億円)が同工業団地を通じ北朝鮮に支払われたという。", + 0u, + 132u, + "1111111111111111111111220" + "111111211111111111111111111111111111111111111111111220" + "11111111121111122211111212211211111111111111111111120", }, { "Chinese script", - "你好世界", - "1110", - }, - { - "Chinese script with \n", - "你好\n世界", - "12010", + "在被捕的64人中,警方落案起訴了35名男子和3名女子,他們年齡介乎15到70歲。\n" + "38人中有1人獲准保釋。\n" + "16名年齡介乎14到33歲的被捕人士獲准保釋候查,另有10人仍被拘留作進一步調查。", + 0u, + 95u, + "11112112111111112111111112111111121121220" + "2111111111220" + "21111112112111111111111211121111111111120", } }; - const unsigned int numberOfTests = 7u; + const unsigned int numberOfTests = 4u; for( unsigned int index = 0u; index < numberOfTests; ++index ) { @@ -169,42 +221,162 @@ int UtcDaliTextSegnemtationSetWordBreakInfo(void) struct BreakInfoData data[] = { { - "Zero characters", + "Zero characters.", "", + 0u, + 0u, "", }, { - "Latin script", - "Hello world", - "11110011110", + "Latin script, full text.", + "Lorem ipsum dolor sit amet, aeque definiebas ea mei, posse iracundia ne cum.\n" + "Usu ne nisl maiorum iudicabit, veniam epicurei oporteat eos an.\n" + "Ne nec nulla regione albucius, mea doctus delenit ad!\n" + "Et everti blandit adversarium mei, eam porro neglegentur suscipiantur an.\n" + "Quidam corpora at duo. 
An eos possim scripserit?", + 0u, + 317u, + "11110011110011110011001110001111001111111110010011000111100111111110010011000" + "1100100111001111110011111111000111110011111110011111110011001000" + "100110011110011111100111111100011001111100111111001000" + "10011111001111110011111111110011000110011110011111111110011111111111001000" + "111110011111100100110001001100111110011111111100", }, { - "Latin script with \n", - "Hello\nworld", - "11110011110", + "Latin script, update first paragraph.", + "Lorem ipsum dolor sit amet, aeque definiebas ea mei, posse iracundia ne cum.\n" + "Usu ne nisl maiorum iudicabit, veniam epicurei oporteat eos an.\n" + "Ne nec nulla regione albucius, mea doctus delenit ad!\n" + "Et everti blandit adversarium mei, eam porro neglegentur suscipiantur an.\n" + "Quidam corpora at duo. An eos possim scripserit?", + 0u, + 77u, + "11110011110011110011001110001111001111111110010011000111100111111110010011000" + "1100100111001111110011111111000111110011111110011111110011001000" + "100110011110011111100111111100011001111100111111001000" + "10011111001111110011111111110011000110011110011111111110011111111111001000" + "111110011111100100110001001100111110011111111100", }, { - "Japanese script", - "こんにちは世界", - "0000000", + "Latin script, update middle paragraphs.", + "Lorem ipsum dolor sit amet, aeque definiebas ea mei, posse iracundia ne cum.\n" + "Usu ne nisl maiorum iudicabit, veniam epicurei oporteat eos an.\n" + "Ne nec nulla regione albucius, mea doctus delenit ad!\n" + "Et everti blandit adversarium mei, eam porro neglegentur suscipiantur an.\n" + "Quidam corpora at duo. 
An eos possim scripserit?", + 77u, + 118u, + "11110011110011110011001110001111001111111110010011000111100111111110010011000" + "1100100111001111110011111111000111110011111110011111110011001000" + "100110011110011111100111111100011001111100111111001000" + "10011111001111110011111111110011000110011110011111111110011111111111001000" + "111110011111100100110001001100111110011111111100", }, { - "Japanese script with \n", - "こんにちは\n世界", - "00000000", + "Latin script, update last paragraph.", + "Lorem ipsum dolor sit amet, aeque definiebas ea mei, posse iracundia ne cum.\n" + "Usu ne nisl maiorum iudicabit, veniam epicurei oporteat eos an.\n" + "Ne nec nulla regione albucius, mea doctus delenit ad!\n" + "Et everti blandit adversarium mei, eam porro neglegentur suscipiantur an.\n" + "Quidam corpora at duo. An eos possim scripserit?", + 269u, + 48u, + "11110011110011110011001110001111001111111110010011000111100111111110010011000" + "1100100111001111110011111111000111110011111110011111110011001000" + "100110011110011111100111111100011001111100111111001000" + "10011111001111110011111111110011000110011110011111111110011111111111001000" + "111110011111100100110001001100111110011111111100", }, { - "Chinese script", - "你好世界", - "0000", + "Japanese script, full text.", + "韓国側は北朝鮮当局を通じて米ドルで賃金を支払う。\n" + "国際社会から様々な経済制裁を受ける北朝鮮にとっては出稼ぎ労働などと並んで重要な外貨稼ぎの手段となっている。\n" + "韓国統一省によると15年だけで1320億ウォン(約130億円)が同工業団地を通じ北朝鮮に支払われたという。", + 0u, + 132u, + "0000000000000010000000000" + "000000000000000000000000000000000000000000000000000000" + "00000000010000011100110001100000000000000000000000000", + }, + { + "Japanese script, update first paragraph.", + "韓国側は北朝鮮当局を通じて米ドルで賃金を支払う。\n" + "国際社会から様々な経済制裁を受ける北朝鮮にとっては出稼ぎ労働などと並んで重要な外貨稼ぎの手段となっている。\n" + "韓国統一省によると15年だけで1320億ウォン(約130億円)が同工業団地を通じ北朝鮮に支払われたという。", + 0u, + 25u, + "0000000000000010000000000" + "000000000000000000000000000000000000000000000000000000" + "00000000010000011100110001100000000000000000000000000", + }, + { + "Japanese script, update middle 
paragraph.", + "韓国側は北朝鮮当局を通じて米ドルで賃金を支払う。\n" + "国際社会から様々な経済制裁を受ける北朝鮮にとっては出稼ぎ労働などと並んで重要な外貨稼ぎの手段となっている。\n" + "韓国統一省によると15年だけで1320億ウォン(約130億円)が同工業団地を通じ北朝鮮に支払われたという。", + 25u, + 54u, + "0000000000000010000000000" + "000000000000000000000000000000000000000000000000000000" + "00000000010000011100110001100000000000000000000000000", + }, + { + "Japanese script, update last paragraph.", + "韓国側は北朝鮮当局を通じて米ドルで賃金を支払う。\n" + "国際社会から様々な経済制裁を受ける北朝鮮にとっては出稼ぎ労働などと並んで重要な外貨稼ぎの手段となっている。\n" + "韓国統一省によると15年だけで1320億ウォン(約130億円)が同工業団地を通じ北朝鮮に支払われたという。", + 79u, + 53u, + "0000000000000010000000000" + "000000000000000000000000000000000000000000000000000000" + "00000000010000011100110001100000000000000000000000000", + }, + { + "Chinese script, full text.", + "在被捕的64人中,警方落案起訴了35名男子和3名女子,他們年齡介乎15到70歲。\n" + "38人中有1人獲准保釋。\n" + "16名年齡介乎14到33歲的被捕人士獲准保釋候查,另有10人仍被拘留作進一步調查。", + 0u, + 95u, + "00001000000000001000000000000000010010000" + "1000000000000" + "10000001001000000000000000010000000000000", + }, + { + "Chinese script, update first paragraph.", + "在被捕的64人中,警方落案起訴了35名男子和3名女子,他們年齡介乎15到70歲。\n" + "38人中有1人獲准保釋。\n" + "16名年齡介乎14到33歲的被捕人士獲准保釋候查,另有10人仍被拘留作進一步調查。", + 0u, + 41u, + "00001000000000001000000000000000010010000" + "1000000000000" + "10000001001000000000000000010000000000000", + }, + { + "Chinese script, update middle paragraph.", + "在被捕的64人中,警方落案起訴了35名男子和3名女子,他們年齡介乎15到70歲。\n" + "38人中有1人獲准保釋。\n" + "16名年齡介乎14到33歲的被捕人士獲准保釋候查,另有10人仍被拘留作進一步調查。", + 41u, + 13u, + "00001000000000001000000000000000010010000" + "1000000000000" + "10000001001000000000000000010000000000000", }, { - "Chinese script with \n", - "你好\n世界", - "00000", + "Chinese script, update last paragraph.", + "在被捕的64人中,警方落案起訴了35名男子和3名女子,他們年齡介乎15到70歲。\n" + "38人中有1人獲准保釋。\n" + "16名年齡介乎14到33歲的被捕人士獲准保釋候查,另有10人仍被拘留作進一步調查。", + 54u, + 41u, + "00001000000000001000000000000000010010000" + "1000000000000" + "10000001001000000000000000010000000000000", } }; - const unsigned int numberOfTests = 7u; + const unsigned int numberOfTests = 13u; for( unsigned int 
index = 0u; index < numberOfTests; ++index ) { diff --git a/dali-toolkit/internal/text/segmentation.cpp b/dali-toolkit/internal/text/segmentation.cpp index a7c7135..5e590b5 100644 --- a/dali-toolkit/internal/text/segmentation.cpp +++ b/dali-toolkit/internal/text/segmentation.cpp @@ -85,29 +85,59 @@ void SetLineBreakInfo( const Vector& text, } void SetWordBreakInfo( const Vector& text, + CharacterIndex startIndex, + Length numberOfCharacters, Vector& wordBreakInfo ) { - const Length numberOfCharacters = text.Count(); + const Length totalNumberOfCharacters = text.Count(); - if( 0u == numberOfCharacters ) + if( 0u == totalNumberOfCharacters ) { // Nothing to do if there are no characters. return; } + // Resize the vector. + wordBreakInfo.Resize( totalNumberOfCharacters ); + + // Whether the current buffer is being updated or is set from scratch. + const bool updateCurrentBuffer = numberOfCharacters < totalNumberOfCharacters; + + WordBreakInfo* wordBreakInfoBuffer = NULL; + Vector newWordBreakInfo; + + if( updateCurrentBuffer ) + { + newWordBreakInfo.Resize( numberOfCharacters ); + wordBreakInfoBuffer = newWordBreakInfo.Begin(); + } + else + { + wordBreakInfoBuffer = wordBreakInfo.Begin(); + } + // Retrieve the word break info. - wordBreakInfo.Resize( numberOfCharacters ); - TextAbstraction::Segmentation::Get().GetWordBreakPositions( text.Begin(), + TextAbstraction::Segmentation::Get().GetWordBreakPositions( text.Begin() + startIndex, numberOfCharacters, - wordBreakInfo.Begin() ); + wordBreakInfoBuffer ); + + // If the word break info is updated, it needs to be inserted in the model. 
+ if( updateCurrentBuffer ) + { + wordBreakInfo.Insert( wordBreakInfo.Begin() + startIndex, + newWordBreakInfo.Begin(), + newWordBreakInfo.End() ); + wordBreakInfo.Resize( totalNumberOfCharacters ); + } + #ifdef DEBUG_ENABLED if( gLogFilter->IsEnabledFor(Debug::Verbose) ) { std::string utf8; - Utf32ToUtf8( text.Begin(), numberOfCharacters, utf8 ); + Utf32ToUtf8( text.Begin(), totalNumberOfCharacters, utf8 ); std::string info; - info.reserve( numberOfCharacters ); + info.reserve( totalNumberOfCharacters ); for( unsigned int i=0; i('0' + wordBreakInfo[i]) ); diff --git a/dali-toolkit/internal/text/segmentation.h b/dali-toolkit/internal/text/segmentation.h index 4bbe5d5..61ddcc7 100644 --- a/dali-toolkit/internal/text/segmentation.h +++ b/dali-toolkit/internal/text/segmentation.h @@ -57,9 +57,13 @@ void SetLineBreakInfo( const Vector& text, * - 1 is a WORD_NO_BREAK. Text can't be broken into a new word. * * @param[in] text Vector of UTF-32 characters. + * @param[in] startIndex The character from where the break info is set. + * @param[in] numberOfCharacters The number of characters. * @param[out] wordBreakInfo The word break info. */ void SetWordBreakInfo( const Vector& text, + CharacterIndex startIndex, + Length numberOfCharacters, Vector& wordBreakInfo ); } // namespace Text diff --git a/dali-toolkit/internal/text/text-controller-impl.cpp b/dali-toolkit/internal/text/text-controller-impl.cpp index 9c06327..4ac12b7 100644 --- a/dali-toolkit/internal/text/text-controller-impl.cpp +++ b/dali-toolkit/internal/text/text-controller-impl.cpp @@ -325,6 +325,8 @@ void Controller::Impl::UpdateModel( OperationsMask operationsRequired ) const Length numberOfCharacters = utf32Characters.Count(); Vector& lineBreakInfo = mLogicalModel->mLineBreakInfo; + CharacterIndex startIndex = 0u; + Length requestedNumberOfCharacters = numberOfCharacters; if( GET_LINE_BREAKS & operations ) { // Retrieves the line break info. 
The line break info is used to split the text in 'paragraphs' to @@ -344,6 +346,8 @@ void Controller::Impl::UpdateModel( OperationsMask operationsRequired ) wordBreakInfo.Resize( numberOfCharacters, TextAbstraction::WORD_NO_BREAK ); SetWordBreakInfo( utf32Characters, + startIndex, + requestedNumberOfCharacters, wordBreakInfo ); } -- 2.7.4