|
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using MS.Internal.WindowsRuntime.Windows.Data.Text;
using System.Windows.Documents.MsSpellCheckLib;
using static System.Windows.Documents.WinRTSpellerInterop;
namespace System.Windows.Documents
{
internal static class WinRTSpellerInteropExtensions
{
/// <summary>
/// Tokenizes <paramref name="text"/> using <paramref name="segmenter"/>, and then identifies fixes-up
/// the tokens to account for any missed text "in-between" those tokens.
/// </summary>
/// <param name="segmenter">Word-breaker instance</param>
/// <param name="text">The text being tokenized</param>
/// <param name="spellChecker">The spell-checker instance used to augment the tokenizing process</param>
/// <param name="owner">Calling <see cref="WinRTSpellerInterop"/> instance</param>
/// <returns></returns>
/// <remarks>
/// Windows.Data.Text.WordsSegmenter tends to drop punctuation characters like period ('.')
/// when tokenizing text. Though this behavior is compatible with a vast majority of text-processing
/// scenarios (like word-counting), it is not ideal for spell-checking.
///
/// In this method, the following <paramref name="spellChecker"/> augmented heuristic is applied to update the token-list generated by
/// <paramref name="segmenter"/>.
///
/// - Identify if any text 'missingFragment' has been dropped by the <paramref name="segmenter"/>
/// - If the token immediately preceding 'missingFragment', previousToken, has a spelling error, then attempt to
/// create new candiate tokens in the following order:
///
/// previousToken + missingFragment[0..0]
/// previousToken + missingFragment[0..1]
/// previousToken + missingFragment[0..2]
/// ...
/// ...
/// previousToken + missingFragment[0..LEN-1], where LEN = LEN(missingFragment)
///
/// - Select the first candidate token that is free of spelling errors, and replace 'previousToken' with it.
/// - For performance reasons, we choose a constant MAXLEN = 4 such that when LEN > MAXLEN, only MAXLEN
/// tokens are considered.
/// - MAXLEN = 4 is a somewhat arbitrary choice, though it seems more than sufficient to address common
/// problems this heuristic is intended to help with.
///
/// - Typical word-breaking problems that have been observed empirically involve only one missed character,
/// for which MAXLEN=1 would be sufficient. MAXLEN=4 is chosen as a sufficiently-large tradeoff between
/// correctness and performance.
///
/// - Also see https://github.com/dotnet/wpf/pull/2753#issuecomment-602120768 for a discussion related to this.
/// </remarks>
public static IReadOnlyList<SpellerSegment> ComprehensiveGetTokens(
this WordsSegmenter segmenter,
string text,
SpellChecker spellChecker,
WinRTSpellerInterop owner)
{
IReadOnlyList<WordSegment> tokens = segmenter?.GetTokens(text) ?? Array.Empty<WordSegment>();
if (tokens.Count == 0)
{
return Array.Empty<SpellerSegment>();
}
var allTokens = new List<SpellerSegment>();
int predictedNextTokenStartPosition = 0;
for (int i = 0; i < tokens.Count; i++)
{
int nextTokenStartPosition = (int)tokens[i].SourceTextSegment.StartPosition;
int nextTokenLength = (int)tokens[i].SourceTextSegment.Length;
if (spellChecker != null)
{
if (nextTokenStartPosition > predictedNextTokenStartPosition)
{
// There is a "gap" between the last recorded token and the current token.
// Identify the missing token and add it as a "supplementary word segment" - but only if the token
// turns out to be a substantial one (i.e., if the string is non-blank/non-empty).
var missingFragment =
new SpellerSegment(
text,
new WinRTSpellerInterop.TextRange(
predictedNextTokenStartPosition,
nextTokenStartPosition - predictedNextTokenStartPosition),
spellChecker,
owner);
if (allTokens.Count > 0)
{
var substToken = GetSpellCheckCleanSubstitutionToken(spellChecker, text, allTokens[allTokens.Count - 1], missingFragment);
if (substToken != null)
{
allTokens[allTokens.Count - 1] = new SpellerSegment(text, substToken.Value, spellChecker, owner);
}
}
}
}
allTokens.Add(
new SpellerSegment(
text,
new WinRTSpellerInterop.TextRange(
nextTokenStartPosition,
nextTokenLength),
spellChecker,
owner));
predictedNextTokenStartPosition = nextTokenStartPosition + nextTokenLength;
}
if (tokens.Count > 0 &&
spellChecker != null &&
spellChecker.HasErrors(tokens[tokens.Count - 1].Text) &&
predictedNextTokenStartPosition < text.Length)
{
// There is a token possibly missing at the end of the string
var missingFragment =
new SpellerSegment(
text,
new WinRTSpellerInterop.TextRange(
predictedNextTokenStartPosition,
text.Length - predictedNextTokenStartPosition),
spellChecker,
owner);
if (allTokens.Count > 0)
{
var substToken = GetSpellCheckCleanSubstitutionToken(spellChecker, text, allTokens[allTokens.Count - 1], missingFragment);
if (substToken != null)
{
allTokens[allTokens.Count - 1] = new SpellerSegment(text, substToken.Value, spellChecker, owner);
}
}
}
return allTokens.AsReadOnly();
}
/// <summary>
/// Checks through combinations of <paramref name="lastToken"/> + substrings(<paramref name="missingFragment"/>) and
/// returns the first spellcheck-clean result.
/// </summary>
/// <param name="spellChecker">Spell-checker</param>
/// <param name="documentText">Overall document text within which the text-ranges are computed</param>
/// <param name="lastToken">Previous token immediately preceding <paramref name="missingFragment"/></param>
/// <param name="missingFragment">The missing-fragment identified immediately after <paramref name="lastToken"/></param>
/// <returns></returns>
/// <remarks>
/// See note about MAXLEN in <see cref="ComprehensiveGetTokens(WordsSegmenter, string, SpellChecker, WinRTSpellerInterop)"/>
/// which explains the rationale behind the value of the constant AlternateFormsMaximumCount.
/// </remarks>
private static WinRTSpellerInterop.TextRange? GetSpellCheckCleanSubstitutionToken(
SpellChecker spellChecker,
string documentText,
SpellerSegment lastToken,
SpellerSegment missingFragment)
{
const int AlternateFormsMaximumCount = 4;
string lastTokenText = lastToken?.Text;
string missingFragmentText = missingFragment?.Text.TrimEnd('\0');
if (string.IsNullOrWhiteSpace(missingFragmentText) ||
string.IsNullOrWhiteSpace(lastTokenText) ||
string.IsNullOrWhiteSpace(documentText) ||
spellChecker == null ||
!spellChecker.HasErrors(lastTokenText))
{
return null;
}
string previousAltForm = lastTokenText;
int altFormsCount = Math.Min(missingFragmentText.Length, AlternateFormsMaximumCount);
// One of the substring-permutations of the missingFragment - when concatenated with 'lastToken' - could be a viable
// replacement for 'lastToken'
for (int i = 1; i <= altFormsCount; i++)
{
var altForm = documentText.Substring(lastToken.TextRange.Start, lastTokenText.Length + i).TrimEnd('\0').TrimEnd();
if (altForm.Length > previousAltForm.Length)
{
previousAltForm = altForm;
if (!spellChecker.HasErrors(altForm))
{
// Use this altForm in place lastToken
return new WinRTSpellerInterop.TextRange(
lastToken.TextRange.Start,
altForm.Length);
}
}
else
{
// trimming yielded an altForm we've already checked, don't check again.
// We could stop checking now if we knew that an altForm with
// embedded trimmable characters could never be a correctly spelled
// token. That's probably true, but the spell-checking docs don't
// guarantee it. So for safety, leaving the next line commented out.
// return null;
}
}
return null;
}
}
}
|