From a7544884bb3be5f4756d629a1a69c9eed2815c8e Mon Sep 17 00:00:00 2001 From: Nikolaj Olsson Date: Fri, 21 Dec 2018 18:17:01 +0100 Subject: [PATCH] Work on xml content translator --- .gitignore | 1 + src/XmlContentTranslator/Main.Designer.cs | 10 +- src/XmlContentTranslator/Main.cs | 133 +--- .../Properties/AssemblyInfo.cs | 4 +- .../Translator/Configuration.cs | 8 + .../Translator/Formatting.cs | 128 ++++ .../Translator/GoogleTranslator1.cs | 301 ++++++++ .../Translator/HtmlUtil.cs | 723 ++++++++++++++++++ .../Translator/ITranslator.cs | 13 + src/XmlContentTranslator/Translator/Json.cs | 54 ++ .../Translator/StringExtensions.cs | 274 +++++++ .../Translator/TranslationHelper.cs | 64 ++ .../Translator/TranslationPair.cs | 24 + .../Translator/Utilities.cs | 683 +++++++++++++++++ .../XmlContentTranslator.csproj | 10 + .../XmlContentTranslator.csproj.user | 6 + 16 files changed, 2322 insertions(+), 114 deletions(-) create mode 100644 src/XmlContentTranslator/Translator/Configuration.cs create mode 100644 src/XmlContentTranslator/Translator/Formatting.cs create mode 100644 src/XmlContentTranslator/Translator/GoogleTranslator1.cs create mode 100644 src/XmlContentTranslator/Translator/HtmlUtil.cs create mode 100644 src/XmlContentTranslator/Translator/ITranslator.cs create mode 100644 src/XmlContentTranslator/Translator/Json.cs create mode 100644 src/XmlContentTranslator/Translator/StringExtensions.cs create mode 100644 src/XmlContentTranslator/Translator/TranslationHelper.cs create mode 100644 src/XmlContentTranslator/Translator/TranslationPair.cs create mode 100644 src/XmlContentTranslator/Translator/Utilities.cs create mode 100644 src/XmlContentTranslator/XmlContentTranslator.csproj.user diff --git a/.gitignore b/.gitignore index a32af7e..1906eb5 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ bin obj /src/XmlContentTranslator/XmlContentTranslator.csproj.DotSettings /src/XmlContentTranslator/XmlContentTranslator.zip +/src/.vs/XmlContentTranslator/v15 diff --git 
a/src/XmlContentTranslator/Main.Designer.cs b/src/XmlContentTranslator/Main.Designer.cs index 833633d..431814d 100644 --- a/src/XmlContentTranslator/Main.Designer.cs +++ b/src/XmlContentTranslator/Main.Designer.cs @@ -90,19 +90,19 @@ private void InitializeComponent() this.translateViaGoogleToolStripMenuItem, this.setValueFromMasterToolStripMenuItem}); this.contextMenuStrip1.Name = "contextMenuStrip1"; - this.contextMenuStrip1.Size = new System.Drawing.Size(217, 48); + this.contextMenuStrip1.Size = new System.Drawing.Size(216, 48); // // translateViaGoogleToolStripMenuItem // this.translateViaGoogleToolStripMenuItem.Name = "translateViaGoogleToolStripMenuItem"; - this.translateViaGoogleToolStripMenuItem.Size = new System.Drawing.Size(216, 22); + this.translateViaGoogleToolStripMenuItem.Size = new System.Drawing.Size(215, 22); this.translateViaGoogleToolStripMenuItem.Text = "Translate via google"; this.translateViaGoogleToolStripMenuItem.Click += new System.EventHandler(this.translateViaGoogleToolStripMenuItem_Click); // // setValueFromMasterToolStripMenuItem // this.setValueFromMasterToolStripMenuItem.Name = "setValueFromMasterToolStripMenuItem"; - this.setValueFromMasterToolStripMenuItem.Size = new System.Drawing.Size(216, 22); + this.setValueFromMasterToolStripMenuItem.Size = new System.Drawing.Size(215, 22); this.setValueFromMasterToolStripMenuItem.Text = "Transfer value from master"; this.setValueFromMasterToolStripMenuItem.Click += new System.EventHandler(this.setValueFromMasterToolStripMenuItem_Click); // @@ -203,7 +203,7 @@ private void InitializeComponent() this.toolsToolStripMenuItem.DropDownItems.AddRange(new System.Windows.Forms.ToolStripItem[] { this.googleTranslateSelectedLinesToolStripMenuItem}); this.toolsToolStripMenuItem.Name = "toolsToolStripMenuItem"; - this.toolsToolStripMenuItem.Size = new System.Drawing.Size(48, 20); + this.toolsToolStripMenuItem.Size = new System.Drawing.Size(47, 20); this.toolsToolStripMenuItem.Text = "&Tools"; // // 
googleTranslateSelectedLinesToolStripMenuItem @@ -332,7 +332,7 @@ private void InitializeComponent() this.MainMenuStrip = this.menuStrip1; this.MinimumSize = new System.Drawing.Size(820, 350); this.Name = "Main"; - this.Text = "XML Content Translator 1.6"; + this.Text = "XML Content Translator 1.7"; this.FormClosing += new System.Windows.Forms.FormClosingEventHandler(this.Form1FormClosing); this.Load += new System.EventHandler(this.Form1Load); this.KeyDown += new System.Windows.Forms.KeyEventHandler(this.Form1KeyDown); diff --git a/src/XmlContentTranslator/Main.cs b/src/XmlContentTranslator/Main.cs index a1768c6..ee27ad5 100644 --- a/src/XmlContentTranslator/Main.cs +++ b/src/XmlContentTranslator/Main.cs @@ -3,11 +3,13 @@ using System.Collections.Generic; using System.Drawing; using System.IO; +using System.Linq; using System.Net; using System.Text; using System.Web; using System.Windows.Forms; using System.Xml; +using XmlContentTranslator.Translator; namespace XmlContentTranslator { @@ -601,35 +603,30 @@ private void GoogleTranslateSelectedLines() string oldText = string.Empty; string newText = string.Empty; - if (listViewLanguageTags.SelectedItems.Count > 10) - { - toolStripStatusLabel1.Text = "Translating via Google Translate. Please wait..."; - Refresh(); - } + toolStripStatusLabel1.Text = "Translating via Google Translate. 
Please wait..."; + Refresh(); + var translator = new GoogleTranslator1(); Cursor = Cursors.WaitCursor; var sb = new StringBuilder(); var res = new StringBuilder(); var oldLines = new List(); + var list = new List(); foreach (ListViewItem item in listViewLanguageTags.SelectedItems) { oldText = item.SubItems[1].Text; oldLines.Add(oldText); var urlEncode = HttpUtility.UrlEncode(sb + newText); - if (urlEncode != null && urlEncode.Length >= 1000) + if (urlEncode.Length >= 1000) { res.Append(TranslateTextViaScreenScraping(sb.ToString(), (comboBoxFrom.SelectedItem as ComboBoxItem).Value + "|" + (comboBoxTo.SelectedItem as ComboBoxItem).Value)); sb = new StringBuilder(); } + list.Add(oldText); sb.Append("== " + oldText + " "); } - res.Append(TranslateTextViaScreenScraping(sb.ToString(), (comboBoxFrom.SelectedItem as ComboBoxItem).Value + "|" + (comboBoxTo.SelectedItem as ComboBoxItem).Value)); - - var lines = new List(); - foreach (string s in res.ToString().Split(new string[] { "==" }, StringSplitOptions.None)) - lines.Add(s.Trim()); - lines.RemoveAt(0); - + var log = new StringBuilder(); + var lines = translator.Translate(((ComboBoxItem)comboBoxFrom.SelectedItem).Value, ((ComboBoxItem)comboBoxTo.SelectedItem).Value, list, log).ToList(); if (listViewLanguageTags.SelectedItems.Count != lines.Count) { MessageBox.Show("Error getting/decoding translation from google!"); @@ -731,97 +728,10 @@ private void setValueFromMasterToolStripMenuItem_Click(object sender, EventArgs public static void FillComboWithLanguages(ComboBox comboBox) { - comboBox.Items.Add(new ComboBoxItem("AFRIKAANS", "af")); - comboBox.Items.Add(new ComboBoxItem("ALBANIAN", "sq")); - comboBox.Items.Add(new ComboBoxItem("AMHARIC", "am")); - comboBox.Items.Add(new ComboBoxItem("ARABIC", "ar")); - comboBox.Items.Add(new ComboBoxItem("ARMENIAN", "hy")); - comboBox.Items.Add(new ComboBoxItem("AZERBAIJANI", "az")); - comboBox.Items.Add(new ComboBoxItem("BASQUE", "eu")); - comboBox.Items.Add(new 
ComboBoxItem("BELARUSIAN", "be")); - comboBox.Items.Add(new ComboBoxItem("BENGALI", "bn")); - comboBox.Items.Add(new ComboBoxItem("BIHARI", "bh")); - comboBox.Items.Add(new ComboBoxItem("BULGARIAN", "bg")); - comboBox.Items.Add(new ComboBoxItem("BURMESE", "my")); - comboBox.Items.Add(new ComboBoxItem("CATALAN", "ca")); - comboBox.Items.Add(new ComboBoxItem("CHEROKEE", "chr")); - comboBox.Items.Add(new ComboBoxItem("CHINESE", "zh")); - comboBox.Items.Add(new ComboBoxItem("CHINESE_SIMPLIFIED", "zh-CN")); - comboBox.Items.Add(new ComboBoxItem("CHINESE_TRADITIONAL", "zh-TW")); - comboBox.Items.Add(new ComboBoxItem("CROATIAN", "hr")); - comboBox.Items.Add(new ComboBoxItem("CZECH", "cs")); - comboBox.Items.Add(new ComboBoxItem("DANISH", "da")); - comboBox.Items.Add(new ComboBoxItem("DHIVEHI", "dv")); - comboBox.Items.Add(new ComboBoxItem("DUTCH", "nl")); - comboBox.Items.Add(new ComboBoxItem("ENGLISH", "en")); - comboBox.Items.Add(new ComboBoxItem("ESPERANTO", "eo")); - comboBox.Items.Add(new ComboBoxItem("ESTONIAN", "et")); - comboBox.Items.Add(new ComboBoxItem("FILIPINO", "tl")); - comboBox.Items.Add(new ComboBoxItem("FINNISH", "fi")); - comboBox.Items.Add(new ComboBoxItem("FRENCH", "fr")); - comboBox.Items.Add(new ComboBoxItem("GALICIAN", "gl")); - comboBox.Items.Add(new ComboBoxItem("GEORGIAN", "ka")); - comboBox.Items.Add(new ComboBoxItem("GERMAN", "de")); - comboBox.Items.Add(new ComboBoxItem("GREEK", "el")); - comboBox.Items.Add(new ComboBoxItem("GUARANI", "gn")); - comboBox.Items.Add(new ComboBoxItem("GUJARATI", "gu")); - comboBox.Items.Add(new ComboBoxItem("HEBREW", "iw")); - comboBox.Items.Add(new ComboBoxItem("HINDI", "hi")); - comboBox.Items.Add(new ComboBoxItem("HUNGARIAN", "hu")); - comboBox.Items.Add(new ComboBoxItem("ICELANDIC", "is")); - comboBox.Items.Add(new ComboBoxItem("IRISH", "ga")); - comboBox.Items.Add(new ComboBoxItem("INDONESIAN", "id")); - comboBox.Items.Add(new ComboBoxItem("INUKTITUT", "iu")); - comboBox.Items.Add(new ComboBoxItem("ITALIAN", 
"it")); - comboBox.Items.Add(new ComboBoxItem("JAPANESE", "ja")); - comboBox.Items.Add(new ComboBoxItem("KANNADA", "kn")); - comboBox.Items.Add(new ComboBoxItem("KAZAKH", "kk")); - comboBox.Items.Add(new ComboBoxItem("KHMER", "km")); - comboBox.Items.Add(new ComboBoxItem("KOREAN", "ko")); - comboBox.Items.Add(new ComboBoxItem("KURDISH", "ku")); - comboBox.Items.Add(new ComboBoxItem("KYRGYZ", "ky")); - comboBox.Items.Add(new ComboBoxItem("LAOTHIAN", "lo")); - comboBox.Items.Add(new ComboBoxItem("LATVIAN", "lv")); - comboBox.Items.Add(new ComboBoxItem("LITHUANIAN", "lt")); - comboBox.Items.Add(new ComboBoxItem("MACEDONIAN", "mk")); - comboBox.Items.Add(new ComboBoxItem("MALAY", "ms")); - comboBox.Items.Add(new ComboBoxItem("MALAYALAM", "ml")); - comboBox.Items.Add(new ComboBoxItem("MALTESE", "mt")); - comboBox.Items.Add(new ComboBoxItem("MARATHI", "mr")); - comboBox.Items.Add(new ComboBoxItem("MONGOLIAN", "mn")); - comboBox.Items.Add(new ComboBoxItem("NEPALI", "ne")); - comboBox.Items.Add(new ComboBoxItem("NORWEGIAN", "no")); - comboBox.Items.Add(new ComboBoxItem("ORIYA", "or")); - comboBox.Items.Add(new ComboBoxItem("PASHTO", "ps")); - comboBox.Items.Add(new ComboBoxItem("PERSIAN", "fa")); - comboBox.Items.Add(new ComboBoxItem("POLISH", "pl")); - comboBox.Items.Add(new ComboBoxItem("PORTUGUESE", "pt-PT")); - comboBox.Items.Add(new ComboBoxItem("PUNJABI", "pa")); - comboBox.Items.Add(new ComboBoxItem("ROMANIAN", "ro")); - comboBox.Items.Add(new ComboBoxItem("RUSSIAN", "ru")); - comboBox.Items.Add(new ComboBoxItem("SANSKRIT", "sa")); - comboBox.Items.Add(new ComboBoxItem("SERBIAN", "sr")); - comboBox.Items.Add(new ComboBoxItem("SINDHI", "sd")); - comboBox.Items.Add(new ComboBoxItem("SINHALESE", "si")); - comboBox.Items.Add(new ComboBoxItem("SLOVAK", "sk")); - comboBox.Items.Add(new ComboBoxItem("SLOVENIAN", "sl")); - comboBox.Items.Add(new ComboBoxItem("SPANISH", "es")); - comboBox.Items.Add(new ComboBoxItem("SWAHILI", "sw")); - comboBox.Items.Add(new 
ComboBoxItem("SWEDISH", "sv")); - comboBox.Items.Add(new ComboBoxItem("TAJIK", "tg")); - comboBox.Items.Add(new ComboBoxItem("TAMIL", "ta")); - comboBox.Items.Add(new ComboBoxItem("TAGALOG", "tl")); - comboBox.Items.Add(new ComboBoxItem("TELUGU", "te")); - comboBox.Items.Add(new ComboBoxItem("THAI", "th")); - comboBox.Items.Add(new ComboBoxItem("TIBETAN", "bo")); - comboBox.Items.Add(new ComboBoxItem("TURKISH", "tr")); - comboBox.Items.Add(new ComboBoxItem("UKRAINIAN", "uk")); - comboBox.Items.Add(new ComboBoxItem("URDU", "ur")); - comboBox.Items.Add(new ComboBoxItem("UZBEK", "uz")); - comboBox.Items.Add(new ComboBoxItem("UIGHUR", "ug")); - comboBox.Items.Add(new ComboBoxItem("VIETNAMESE", "vi")); - comboBox.Items.Add(new ComboBoxItem("WELSH", "cy")); - comboBox.Items.Add(new ComboBoxItem("YIDDISH", "yi")); + foreach (var pair in new GoogleTranslator1().GetTranslationPairs()) + { + comboBox.Items.Add(new ComboBoxItem(pair.Name, pair.Code)); + } } private void ToolStripMenuItem1Click(object sender, EventArgs e) @@ -868,10 +778,19 @@ private void SaveToolStripMenuItemClick(object sender, EventArgs e) { FillOriginalDocumentFromSecondLanguage(); - var settings = new XmlWriterSettings { Indent = true }; - using (var writer = XmlWriter.Create(_secondLanguageFileName, settings)) + using (var sw = new StringWriter()) { - _originalDocument.Save(writer); + using (var xw = XmlWriter.Create(sw, new XmlWriterSettings { Indent = true, Encoding = Encoding.UTF8 })) + { + _originalDocument.Save(xw); + var s = sw.ToString(); + if (s.Contains("Subtitle Edit")) + { + s = s.Replace("", ""); + } + s = s.Replace("encoding=\"utf-16\"?", "encoding=\"utf-8\"?"); + File.WriteAllText(_secondLanguageFileName, s, Encoding.UTF8); + } } _change = false; toolStripStatusLabel1.Text = "File saved - " + _secondLanguageFileName; diff --git a/src/XmlContentTranslator/Properties/AssemblyInfo.cs b/src/XmlContentTranslator/Properties/AssemblyInfo.cs index 2da2ba3..d6e45d2 100644 --- 
a/src/XmlContentTranslator/Properties/AssemblyInfo.cs +++ b/src/XmlContentTranslator/Properties/AssemblyInfo.cs @@ -32,5 +32,5 @@ // You can specify all the values or you can default the Build and Revision Numbers // by using the '*' as shown below: // [assembly: AssemblyVersion("1.0.*")] -[assembly: AssemblyVersion("1.6.0.0")] -[assembly: AssemblyFileVersion("1.6.0.0")] +[assembly: AssemblyVersion("1.7.0.0")] +[assembly: AssemblyFileVersion("1.7.0.0")] diff --git a/src/XmlContentTranslator/Translator/Configuration.cs b/src/XmlContentTranslator/Translator/Configuration.cs new file mode 100644 index 0000000..6d066fd --- /dev/null +++ b/src/XmlContentTranslator/Translator/Configuration.cs @@ -0,0 +1,8 @@ +namespace XmlContentTranslator.Translator +{ + internal static class Configuration + { + public static double CurrentFrameRate = 23.976; + public static string ListViewLineSeparatorString = "
"; + } +} diff --git a/src/XmlContentTranslator/Translator/Formatting.cs b/src/XmlContentTranslator/Translator/Formatting.cs new file mode 100644 index 0000000..51422cd --- /dev/null +++ b/src/XmlContentTranslator/Translator/Formatting.cs @@ -0,0 +1,128 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace XmlContentTranslator.Translator +{ + public class Formatting + { + private bool Italic { get; set; } + private bool ItalicTwoLines { get; set; } + private string StartTags { get; set; } + private bool AutoBreak { get; set; } + private bool SquareBrackets { get; set; } + private bool SquareBracketsUppercase { get; set; } + + public string SetTagsAndReturnTrimmed(string text, string source) + { + text = text.Trim(); + + // SSA/ASS tags + if (text.StartsWith("{\\")) + { + var endIndex = text.IndexOf('}'); + if (endIndex > 0) + { + StartTags = text.Substring(0, endIndex + 1); + text = text.Remove(0, endIndex + 1).Trim(); + } + } + + // Italic tags + if (text.StartsWith("", StringComparison.Ordinal) && text.EndsWith("", StringComparison.Ordinal) && text.Contains("" + Environment.NewLine + "") && Utilities.GetNumberOfLines(text) == 2 && Utilities.CountTagInText(text, "") == 1) + { + ItalicTwoLines = true; + text = HtmlUtil.RemoveOpenCloseTags(text, HtmlUtil.TagItalic); + } + else if (text.StartsWith("", StringComparison.Ordinal) && text.EndsWith("", StringComparison.Ordinal) && Utilities.CountTagInText(text, "") == 1) + { + Italic = true; + text = text.Substring(3, text.Length - 7); + } + + // Un-break line + var allowedLanguages = new List { "en", "da", "nl", "de", "sv", "nb", "fr", "it" }; + if (allowedLanguages.Contains(source)) + { + var lines = HtmlUtil.RemoveHtmlTags(text).SplitToLines(); + if (lines.Length == 2 && !string.IsNullOrEmpty(lines[0]) && !string.IsNullOrEmpty(lines[1]) && + char.IsLetterOrDigit(lines[0][lines[0].Length - 1]) && + char.IsLower(lines[1][0])) + { + text = text.Replace(Environment.NewLine, " ").Replace(" ", 
" "); + AutoBreak = true; + } + } + + // Square brackets + if (text.StartsWith("[", StringComparison.Ordinal) && text.EndsWith("]", StringComparison.Ordinal) && + Utilities.GetNumberOfLines(text) == 1 && Utilities.CountTagInText(text, "[") == 1 && + Utilities.GetNumberOfLines(text) == 1 && Utilities.CountTagInText(text, "]") == 1) + { + if (text == text.ToUpperInvariant()) + SquareBracketsUppercase = true; + else + SquareBrackets = true; + text = text.Replace("[", string.Empty).Replace("]", string.Empty); + } + + return text.Trim(); + } + + public string ReAddFormatting(string text) + { + // Auto-break line + if (AutoBreak) + { + text = Utilities.AutoBreakLine(text); + } + + // Square brackets + if (SquareBracketsUppercase) + { + text = "[" + text.ToUpperInvariant().Trim() + "]"; + } + else if (SquareBrackets) + { + text = "[" + text.Trim() + "]"; + } + + // Italic tags + if (ItalicTwoLines) + { + var sb = new StringBuilder(); + foreach (var line in text.SplitToLines()) + { + sb.AppendLine("" + line + ""); + } + text = sb.ToString().Trim(); + } + else if (Italic) + { + text = "" + text + ""; + } + + // SSA/ASS tags + text = StartTags + text; + + return text; + } + + + private int NumberOfLines { get; set; } + + public string Unbreak(string text, string source) + { + NumberOfLines = source.SplitToLines().Length; + return text.Replace(Environment.NewLine, " ").Replace(" ", " "); + } + + public string Rebreak(string text) + { + if (NumberOfLines == 1) + return text; + return Utilities.AutoBreakLine(text); + } + + } +} diff --git a/src/XmlContentTranslator/Translator/GoogleTranslator1.cs b/src/XmlContentTranslator/Translator/GoogleTranslator1.cs new file mode 100644 index 0000000..2f05d29 --- /dev/null +++ b/src/XmlContentTranslator/Translator/GoogleTranslator1.cs @@ -0,0 +1,301 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Net; +using System.Text; +using System.Text.RegularExpressions; + +namespace 
XmlContentTranslator.Translator +{ + /// + /// Google translate via Google V1 API - see https://cloud.google.com/translate/ + /// + public class GoogleTranslator1 : ITranslator + { + private const char SplitChar = '\n'; + + public List GetTranslationPairs() + { + return new List + { + new TranslationPair("AFRIKAANS", "af"), + new TranslationPair("ALBANIAN", "sq"), + new TranslationPair("AMHARIC", "am"), + new TranslationPair("ARABIC", "ar"), + new TranslationPair("ARMENIAN", "hy"), + new TranslationPair("AZERBAIJANI", "az"), + new TranslationPair("BASQUE", "eu"), + new TranslationPair("BELARUSIAN", "be"), + new TranslationPair("BENGALI", "bn"), + new TranslationPair("BOSNIAN", "bs"), + new TranslationPair("BULGARIAN", "bg"), + new TranslationPair("BURMESE", "my"), + new TranslationPair("CATALAN", "ca"), + new TranslationPair("CEBUANO", "ceb"), + new TranslationPair("CHICHEWA", "ny"), + new TranslationPair("CHINESE", "zh"), + new TranslationPair("CHINESE_SIMPLIFIED", "zh-CN"), + new TranslationPair("CHINESE_TRADITIONAL", "zh-TW"), + new TranslationPair("CORSICAN", "co"), + new TranslationPair("CROATIAN", "hr"), + new TranslationPair("CZECH", "cs"), + new TranslationPair("DANISH", "da"), + new TranslationPair("DUTCH", "nl"), + new TranslationPair("ENGLISH", "en"), + new TranslationPair("ESPERANTO", "eo"), + new TranslationPair("ESTONIAN", "et"), + new TranslationPair("FILIPINO", "tl"), + new TranslationPair("FINNISH", "fi"), + new TranslationPair("FRENCH", "fr"), + new TranslationPair("FRISIAN", "fy"), + new TranslationPair("GALICIAN", "gl"), + new TranslationPair("GEORGIAN", "ka"), + new TranslationPair("GERMAN", "de"), + new TranslationPair("GREEK", "el"), + new TranslationPair("GUJARATI", "gu"), + new TranslationPair("HAITIAN CREOLE", "ht"), + new TranslationPair("HAUSA", "ha"), + new TranslationPair("HAWAIIAN", "haw"), + new TranslationPair("HEBREW", "iw"), + new TranslationPair("HINDI", "hi"), + new TranslationPair("HMOUNG", "hmn"), + new 
TranslationPair("HUNGARIAN", "hu"), + new TranslationPair("ICELANDIC", "is"), + new TranslationPair("IGBO", "ig"), + new TranslationPair("INDONESIAN", "id"), + new TranslationPair("IRISH", "ga"), + new TranslationPair("ITALIAN", "it"), + new TranslationPair("JAPANESE", "ja"), + new TranslationPair("JAVANESE", "jw"), + new TranslationPair("KANNADA", "kn"), + new TranslationPair("KAZAKH", "kk"), + new TranslationPair("KHMER", "km"), + new TranslationPair("KOREAN", "ko"), + new TranslationPair("KURDISH", "ku"), + new TranslationPair("KYRGYZ", "ky"), + new TranslationPair("LAO", "lo"), + new TranslationPair("LATIN", "la"), + new TranslationPair("LATVIAN", "lv"), + new TranslationPair("LITHUANIAN", "lt"), + new TranslationPair("LUXEMBOURGISH", "lb"), + new TranslationPair("MACEDONIAN", "mk"), + new TranslationPair("MALAY", "ms"), + new TranslationPair("MALAGASY", "mg"), + new TranslationPair("MALAYALAM", "ml"), + new TranslationPair("MALTESE", "mt"), + new TranslationPair("MAORI", "mi"), + new TranslationPair("MARATHI", "mr"), + new TranslationPair("MONGOLIAN", "mn"), + new TranslationPair("MYANMAR", "my"), + new TranslationPair("NEPALI", "ne"), + new TranslationPair("NORWEGIAN", "no"), + new TranslationPair("PASHTO", "ps"), + new TranslationPair("PERSIAN", "fa"), + new TranslationPair("POLISH", "pl"), + new TranslationPair("PORTUGUESE", "pt"), + new TranslationPair("PUNJABI", "pa"), + new TranslationPair("ROMANIAN", "ro"), + new TranslationPair("ROMANJI", "romanji"), + new TranslationPair("RUSSIAN", "ru"), + new TranslationPair("SAMOAN", "sm"), + new TranslationPair("SCOTS GAELIC", "gd"), + new TranslationPair("SERBIAN", "sr"), + new TranslationPair("SESOTHO", "st"), + new TranslationPair("SHONA", "sn"), + new TranslationPair("SINDHI", "sd"), + new TranslationPair("SINHALA", "si"), + new TranslationPair("SLOVAK", "sk"), + new TranslationPair("SLOVENIAN", "sl"), + new TranslationPair("SOMALI", "so"), + new TranslationPair("SPANISH", "es"), + new 
TranslationPair("SUNDANESE", "su"), + new TranslationPair("SWAHILI", "sw"), + new TranslationPair("SWEDISH", "sv"), + new TranslationPair("TAJIK", "tg"), + new TranslationPair("TAMIL", "ta"), + new TranslationPair("TELUGU", "te"), + new TranslationPair("THAI", "th"), + new TranslationPair("TURKISH", "tr"), + new TranslationPair("UKRAINIAN", "uk"), + new TranslationPair("URDU", "ur"), + new TranslationPair("UZBEK", "uz"), + new TranslationPair("VIETNAMESE", "vi"), + new TranslationPair("WELSH", "cy"), + new TranslationPair("XHOSA", "xh"), + new TranslationPair("YIDDISH", "yi"), + new TranslationPair("YORUBA", "yo"), + new TranslationPair("ZULU", "zu"), + }; + } + + public string GetName() + { + return "Google translate (old)"; + } + + public string GetUrl() + { + return "https://translate.google.com/"; + } + + public List Translate(string sourceLanguage, string targetLanguage, List paragraphs, StringBuilder log) + { + string result; + var input = new StringBuilder(); + var formattings = new Formatting[paragraphs.Count]; + for (var index = 0; index < paragraphs.Count; index++) + { + var p = paragraphs[index]; + var f = new Formatting(); + formattings[index] = f; + if (input.Length > 0) + { + input.Append(" " + SplitChar + " "); + } + var text = f.SetTagsAndReturnTrimmed(TranslationHelper.PreTranslate(p.Replace(SplitChar.ToString(), string.Empty), sourceLanguage), sourceLanguage); + text = f.Unbreak(text, p); + input.Append(text); + } + + using (var wc = new WebClient()) + { + string url = $"https://translate.googleapis.com/translate_a/single?client=gtx&sl={sourceLanguage}&tl={targetLanguage}&dt=t&q={Uri.EscapeDataString(input.ToString())}"; + wc.Encoding = Encoding.UTF8; + wc.Headers.Add("user-agent", "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"); + result = wc.DownloadString(url).Trim(); + } + + var sbAll = new StringBuilder(); + int count = 0; + int i = 1; + int level = result.StartsWith('[') ? 
1 : 0; + while (i < result.Length - 1) + { + var sb = new StringBuilder(); + var start = false; + for (; i < result.Length - 1; i++) + { + var c = result[i]; + if (start) + { + if (c == '"' && result[i - 1] != '\\') + { + count++; + if (count % 2 == 1 && level > 2) // even numbers are original text, level > 3 is translation + sbAll.Append(" " + sb); + i++; + break; + } + sb.Append(c); + } + else if (c == '"') + { + start = true; + } + else if (c == '[') + { + level++; + } + else if (c == ']') + { + level--; + } + } + } + + var res = sbAll.ToString().Trim(); + res = Regex.Unescape(res); + var lines = res.SplitToLines().ToList(); + var resultList = new List(); + for (var index = 0; index < lines.Count; index++) + { + var line = lines[index]; + var s = Json.DecodeJsonText(line); + s = string.Join(Environment.NewLine, s.SplitToLines()); + s = TranslationHelper.PostTranslate(s, targetLanguage); + s = s.Replace(Environment.NewLine + Environment.NewLine, Environment.NewLine); + s = s.Replace(Environment.NewLine + " ", Environment.NewLine); + s = s.Replace(Environment.NewLine + " ", Environment.NewLine); + s = s.Replace(" " + Environment.NewLine, Environment.NewLine); + s = s.Replace(" " + Environment.NewLine, Environment.NewLine).Trim(); + if (formattings.Length > index) + { + s = formattings[index].ReAddFormatting(s); + s = formattings[index].Rebreak(s); + } + + resultList.Add(s); + } + + if (resultList.Count > paragraphs.Count) + { + var timmedList = resultList.Where(p => !string.IsNullOrEmpty(p)).ToList(); + if (timmedList.Count == paragraphs.Count) + return timmedList; + } + + if (resultList.Count < paragraphs.Count) + { + var splitList = SplitMergedLines(resultList, paragraphs); + if (splitList.Count == paragraphs.Count) + return splitList; + } + + return resultList; + } + + private static List SplitMergedLines(List input, List paragraphs) + { + var hits = 0; + var results = new List(); + for (var index = 0; index < input.Count; index++) + { + var line = 
input[index]; + var text = paragraphs[index]; + var badPoints = 0; + if (text.StartsWith("[") && !line.StartsWith("[")) + badPoints++; + if (text.StartsWith("-") && !line.StartsWith("-")) + badPoints++; + if (text.Length > 0 && char.IsUpper(text[0]) && line.Length > 0 && !char.IsUpper(line[0])) + badPoints++; + if (text.EndsWith(".") && !line.EndsWith(".")) + badPoints++; + if (text.EndsWith("!") && !line.EndsWith("!")) + badPoints++; + if (text.EndsWith("?") && !line.EndsWith("?")) + badPoints++; + if (text.EndsWith(",") && !line.EndsWith(",")) + badPoints++; + if (text.EndsWith(":") && !line.EndsWith(":")) + badPoints++; + var added = false; + if (badPoints > 0 && hits + input.Count < paragraphs.Count) + { + var percent = line.Length * 100.0 / text.Length; + if (percent > 150) + { + var temp = Utilities.AutoBreakLine(line).SplitToLines(); + if (temp.Length == 2) + { + hits++; + results.Add(temp[0]); + results.Add(temp[1]); + added = true; + } + } + } + if (!added) + { + results.Add(line); + } + } + + if (results.Count == paragraphs.Count) + return results; + + return input; + } + + } +} diff --git a/src/XmlContentTranslator/Translator/HtmlUtil.cs b/src/XmlContentTranslator/Translator/HtmlUtil.cs new file mode 100644 index 0000000..2aea006 --- /dev/null +++ b/src/XmlContentTranslator/Translator/HtmlUtil.cs @@ -0,0 +1,723 @@ +using System; +using System.Linq; +using System.Text; +using System.Text.RegularExpressions; + +namespace XmlContentTranslator.Translator +{ + /// + /// HTML specific string manipulations. 
+ /// + public static class HtmlUtil + { + public const string TagItalic = "i"; + public const string TagBold = "b"; + public const string TagUnderline = "u"; + public const string TagParagraph = "p"; + public const string TagFont = "font"; + public const string TagCyrillicI = "\u0456"; // Cyrillic Small Letter Byelorussian-Ukrainian i (http://graphemica.com/%D1%96) + + private static readonly Regex TagOpenRegex = new Regex(@"<\s*(?:/\s*)?(\w+)[^>]*>", RegexOptions.Compiled); + + /// + /// Remove all of the specified opening and closing tags from the source HTML string. + /// + /// The source string to search for specified HTML tags. + /// The HTML tags to remove. + /// A new string without the specified opening and closing tags. + public static string RemoveOpenCloseTags(string source, params string[] tags) + { + // This pattern matches these tag formats: + // + // < tag*> + // + // < /tag*> + // + // < / tag*> + return TagOpenRegex.Replace( + source, + m => tags.Contains(m.Groups[1].Value, StringComparer.OrdinalIgnoreCase) ? string.Empty : m.Value); + } + + /// + /// Converts a string to an HTML-encoded string using named character references. + /// + /// The string to encode. + /// An encoded string. 
+ public static string EncodeNamed(string source) + { + if (string.IsNullOrEmpty(source)) + return string.Empty; + + var encoded = new StringBuilder(source.Length); + foreach (var ch in source) + { + switch (ch) + { + case '<': + encoded.Append("<"); + break; + case '>': + encoded.Append(">"); + break; + case '"': + encoded.Append("""); + break; + case '&': + encoded.Append("&"); + break; + case '\'': + encoded.Append("'"); + break; + case ' ': + encoded.Append(" "); + break; + case '–': + encoded.Append("–"); + break; + case '—': + encoded.Append("—"); + break; + case '¡': + encoded.Append("¡"); + break; + case '¿': + encoded.Append("¿"); + break; + case '“': + encoded.Append("“"); + break; + case '”': + encoded.Append("”"); + break; + case '‘': + encoded.Append("‘"); + break; + case '’': + encoded.Append("’"); + break; + case '«': + encoded.Append("«"); + break; + case '»': + encoded.Append("»"); + break; + case '¢': + encoded.Append("¢"); + break; + case '©': + encoded.Append("©"); + break; + case '÷': + encoded.Append("÷"); + break; + case 'µ': + encoded.Append("µ"); + break; + case '·': + encoded.Append("·"); + break; + case '¶': + encoded.Append("¶"); + break; + case '±': + encoded.Append("±"); + break; + case '€': + encoded.Append("€"); + break; + case '£': + encoded.Append("£"); + break; + case '®': + encoded.Append("®"); + break; + case '§': + encoded.Append("§"); + break; + case '™': + encoded.Append("™"); + break; + case '¥': + encoded.Append("¥"); + break; + case 'á': + encoded.Append("á"); + break; + case 'Á': + encoded.Append("Á"); + break; + case 'à': + encoded.Append("à"); + break; + case 'À': + encoded.Append("À"); + break; + case 'â': + encoded.Append("â"); + break; + case 'Â': + encoded.Append("Â"); + break; + case 'å': + encoded.Append("å"); + break; + case 'Å': + encoded.Append("Å"); + break; + case 'ã': + encoded.Append("ã"); + break; + case 'Ã': + encoded.Append("Ã"); + break; + case 'ä': + encoded.Append("ä"); + break; + case 'Ä': + 
encoded.Append("Ä"); + break; + case 'æ': + encoded.Append("æ"); + break; + case 'Æ': + encoded.Append("Æ"); + break; + case 'ç': + encoded.Append("ç"); + break; + case 'Ç': + encoded.Append("Ç"); + break; + case 'é': + encoded.Append("é"); + break; + case 'É': + encoded.Append("É"); + break; + case 'è': + encoded.Append("è"); + break; + case 'È': + encoded.Append("È"); + break; + case 'ê': + encoded.Append("ê"); + break; + case 'Ê': + encoded.Append("Ê"); + break; + case 'ë': + encoded.Append("ë"); + break; + case 'Ë': + encoded.Append("Ë"); + break; + case 'í': + encoded.Append("í"); + break; + case 'Í': + encoded.Append("Í"); + break; + case 'ì': + encoded.Append("ì"); + break; + case 'Ì': + encoded.Append("Ì"); + break; + case 'î': + encoded.Append("î"); + break; + case 'Î': + encoded.Append("Î"); + break; + case 'ï': + encoded.Append("ï"); + break; + case 'Ï': + encoded.Append("Ï"); + break; + case 'ñ': + encoded.Append("ñ"); + break; + case 'Ñ': + encoded.Append("Ñ"); + break; + case 'ó': + encoded.Append("ó"); + break; + case 'Ó': + encoded.Append("Ó"); + break; + case 'ò': + encoded.Append("ò"); + break; + case 'Ò': + encoded.Append("Ò"); + break; + case 'ô': + encoded.Append("ô"); + break; + case 'Ô': + encoded.Append("Ô"); + break; + case 'ø': + encoded.Append("ø"); + break; + case 'Ø': + encoded.Append("Ø"); + break; + case 'õ': + encoded.Append("õ"); + break; + case 'Õ': + encoded.Append("Õ"); + break; + case 'ö': + encoded.Append("ö"); + break; + case 'Ö': + encoded.Append("Ö"); + break; + case 'ß': + encoded.Append("ß"); + break; + case 'ú': + encoded.Append("ú"); + break; + case 'Ú': + encoded.Append("Ú"); + break; + case 'ù': + encoded.Append("ù"); + break; + case 'Ù': + encoded.Append("Ù"); + break; + case 'û': + encoded.Append("û"); + break; + case 'Û': + encoded.Append("Û"); + break; + case 'ü': + encoded.Append("ü"); + break; + case 'Ü': + encoded.Append("Ü"); + break; + case 'ÿ': + encoded.Append("ÿ"); + break; + default: + if (ch > 127) + 
/// <summary>
/// Heuristically decides whether <paramref name="text"/> looks like a URL.
/// </summary>
/// <param name="text">The candidate text; may be null.</param>
/// <returns>
/// <c>true</c> when the text is at least six characters long, contains a dot, contains no
/// space, and either starts with "http://", "https://" or "www.", ends with ".org", ".com"
/// or ".net", or contains one of those three suffixes followed by a slash; otherwise <c>false</c>.
/// </returns>
public static bool IsUrl(string text)
{
    // IndexOf is used directly so that a dot or a space in the very first position is seen too.
    if (string.IsNullOrWhiteSpace(text) || text.Length < 6 || text.IndexOf('.') < 0 || text.IndexOf(' ') >= 0)
    {
        return false;
    }

    // Scheme and host names are not linguistic text: compare ordinally and case-insensitively
    // instead of lower-casing with the current culture.
    const StringComparison ignoreCase = StringComparison.OrdinalIgnoreCase;

    if (text.StartsWith("http://", ignoreCase) ||
        text.StartsWith("https://", ignoreCase) ||
        text.StartsWith("www.", ignoreCase) ||
        text.EndsWith(".org", ignoreCase) ||
        text.EndsWith(".com", ignoreCase) ||
        text.EndsWith(".net", ignoreCase))
    {
        return true;
    }

    return text.IndexOf(".org/", ignoreCase) >= 0 ||
           text.IndexOf(".com/", ignoreCase) >= 0 ||
           text.IndexOf(".net/", ignoreCase) >= 0;
}
/// <summary>
/// Lower-cases every tag opener listed in <c>UppercaseTags</c> that occurs in <paramref name="text"/>.
/// </summary>
/// <param name="text">The text to normalise; null or empty input is returned unchanged.</param>
/// <returns>The text with each matched span (from the tag start up to, but excluding, the next '&gt;') lower-cased.</returns>
public static string FixUpperTags(string text)
{
    if (string.IsNullOrEmpty(text) || !Utilities.Contains(text, '<'))
    {
        return text;
    }

    int tagStart;
    while ((tagStart = text.IndexOfAny(UppercaseTags, StringComparison.Ordinal)) >= 0)
    {
        // The closing bracket is searched for two characters past the tag start.
        var tagEnd = text.IndexOf('>', tagStart + 2);
        if (tagEnd < tagStart)
        {
            // No closing bracket: stop instead of looping forever on the same match.
            break;
        }

        var length = tagEnd - tagStart;
        var lowered = text.Substring(tagStart, length).ToLowerInvariant();
        text = text.Remove(tagStart, length).Insert(tagStart, lowered);
    }

    return text;
}
endTag); + } + else + { + text = text.Replace("", string.Empty); + text = text.Replace("", string.Empty); + } + + text = text.Replace(beginTag + beginTag, beginTag); + text = text.Replace(endTag + endTag, endTag); + + int italicBeginTagCount = Utilities.CountTagInText(text, beginTag); + int italicEndTagCount = Utilities.CountTagInText(text, endTag); + int noOfLines = Utilities.GetNumberOfLines(text); + if (italicBeginTagCount + italicEndTagCount > 0) + { + if (italicBeginTagCount == 1 && italicEndTagCount == 1 && text.IndexOf(beginTag, StringComparison.Ordinal) > text.IndexOf(endTag, StringComparison.Ordinal)) + { + const string pattern = "___________@"; + text = text.Replace(beginTag, pattern); + text = text.Replace(endTag, beginTag); + text = text.Replace(pattern, endTag); + } + + if (italicBeginTagCount == 2 && italicEndTagCount == 0) + { + int firstIndex = text.IndexOf(beginTag, StringComparison.Ordinal); + int lastIndex = text.LastIndexOf(beginTag, StringComparison.Ordinal); + int lastIndexWithNewLine = text.LastIndexOf(Environment.NewLine + beginTag, StringComparison.Ordinal) + Environment.NewLine.Length; + if (noOfLines == 2 && lastIndex == lastIndexWithNewLine && firstIndex < 2) + text = text.Replace(Environment.NewLine, endTag + Environment.NewLine) + endTag; + else + text = text.Remove(lastIndex, beginTag.Length).Insert(lastIndex, endTag); + } + + if (italicBeginTagCount == 1 && italicEndTagCount == 2) + { + int firstIndex = text.IndexOf(endTag, StringComparison.Ordinal); + if (text.StartsWith("--", StringComparison.Ordinal) || + text.StartsWith("- -", StringComparison.Ordinal) || + text.StartsWith("- -", StringComparison.Ordinal) || + text.StartsWith("- -", StringComparison.Ordinal)) + text = text.Remove(0, 5); + else if (firstIndex == 0) + text = text.Remove(0, 4); + else + text = text.Substring(0, firstIndex) + text.Substring(firstIndex + endTag.Length); + } + + if (italicBeginTagCount == 2 && italicEndTagCount == 1) + { + var lines = 
text.SplitToLines(); + if (lines.Length == 2 && lines[0].StartsWith(beginTag, StringComparison.Ordinal) && lines[0].EndsWith(endTag, StringComparison.Ordinal) && + lines[1].StartsWith(beginTag, StringComparison.Ordinal)) + { + text = text.TrimEnd() + endTag; + } + else + { + int lastIndex = text.LastIndexOf(beginTag, StringComparison.Ordinal); + if (text.Length > lastIndex + endTag.Length) + text = text.Substring(0, lastIndex) + text.Substring(lastIndex - 1 + endTag.Length); + else + text = text.Substring(0, lastIndex - 1) + endTag; + } + if (text.StartsWith(beginTag, StringComparison.Ordinal) && text.EndsWith(endTag, StringComparison.Ordinal) && text.Contains(endTag + Environment.NewLine + beginTag)) + { + text = text.Replace(endTag + Environment.NewLine + beginTag, Environment.NewLine); + } + } + + if (italicBeginTagCount == 1 && italicEndTagCount == 0) + { + int lastIndexWithNewLine = text.LastIndexOf(Environment.NewLine + beginTag, StringComparison.Ordinal) + Environment.NewLine.Length; + int lastIndex = text.LastIndexOf(beginTag, StringComparison.Ordinal); + + if (text.StartsWith(beginTag, StringComparison.Ordinal)) + text += endTag; + else if (noOfLines == 2 && lastIndex == lastIndexWithNewLine) + text += endTag; + else + text = text.Replace(beginTag, string.Empty); + } + + if (italicBeginTagCount == 0 && italicEndTagCount == 1) + { + var cleanText = RemoveOpenCloseTags(text, TagItalic, TagBold, TagUnderline, TagCyrillicI); + bool isFixed = false; + + // Foo. + if (text.EndsWith(endTag, StringComparison.Ordinal) && !cleanText.StartsWith("-") && !cleanText.Contains(Environment.NewLine + "-")) + { + text = beginTag + text; + isFixed = true; + } + + // - Foo | - Foo. + // - Bar. | - Foo. 
+ if (!isFixed && Utilities.GetNumberOfLines(cleanText) == 2) + { + int newLineIndex = text.IndexOf(Environment.NewLine, StringComparison.Ordinal); + if (newLineIndex > 0) + { + var firstLine = text.Substring(0, newLineIndex).Trim(); + var secondLine = text.Substring(newLineIndex + 2).Trim(); + if (firstLine.EndsWith(endTag, StringComparison.Ordinal)) + { + firstLine = beginTag + firstLine; + isFixed = true; + } + if (secondLine.EndsWith(endTag, StringComparison.Ordinal)) + { + secondLine = beginTag + secondLine; + isFixed = true; + } + text = firstLine + Environment.NewLine + secondLine; + } + } + if (!isFixed) + text = text.Replace(endTag, string.Empty); + } + + // - foo. + // - bar. + if (italicBeginTagCount == 0 && italicEndTagCount == 2 && text.Contains(endTag + Environment.NewLine, StringComparison.Ordinal) && text.EndsWith(endTag, StringComparison.Ordinal)) + { + text = text.Replace(endTag, string.Empty); + text = beginTag + text + endTag; + } + + if (italicBeginTagCount == 0 && italicEndTagCount == 2 && text.StartsWith(endTag, StringComparison.Ordinal) && text.EndsWith(endTag, StringComparison.Ordinal)) + { + int firstIndex = text.IndexOf(endTag, StringComparison.Ordinal); + text = text.Remove(firstIndex, endTag.Length).Insert(firstIndex, beginTag); + } + + // Foo + // Bar + if (italicBeginTagCount == 2 && italicEndTagCount == 2 && noOfLines == 2) + { + int index = text.IndexOf(Environment.NewLine, StringComparison.Ordinal); + if (index > 0 && text.Length > index + (beginTag.Length + endTag.Length)) + { + var firstLine = text.Substring(0, index).Trim(); + var secondLine = text.Substring(index + Environment.NewLine.Length).Trim(); + + if (firstLine.Length > 10 && firstLine.StartsWith("- ", StringComparison.Ordinal) && firstLine.EndsWith(endTag, StringComparison.Ordinal)) + { + text = "- " + firstLine.Remove(0, 5) + Environment.NewLine + secondLine; + text = text.Replace("- ", "- "); + index = text.IndexOf(Environment.NewLine, StringComparison.Ordinal); + 
firstLine = text.Substring(0, index).Trim(); + secondLine = text.Substring(index + Environment.NewLine.Length).Trim(); + } + if (secondLine.Length > 10 && secondLine.StartsWith("- ", StringComparison.Ordinal) && secondLine.EndsWith(endTag, StringComparison.Ordinal)) + { + text = firstLine + Environment.NewLine + "- " + secondLine.Remove(0, 5); + text = text.Replace("- ", "- "); + index = text.IndexOf(Environment.NewLine, StringComparison.Ordinal); + firstLine = text.Substring(0, index).Trim(); + secondLine = text.Substring(index + Environment.NewLine.Length).Trim(); + } + + if (Utilities.StartsAndEndsWithTag(firstLine, beginTag, endTag) && Utilities.StartsAndEndsWithTag(secondLine, beginTag, endTag)) + { + text = text.Replace(beginTag, string.Empty).Replace(endTag, string.Empty).Trim(); + text = beginTag + text + endTag; + } + } + + //FALCONE: I didn't think
it was going to be you, + var colIdx = text.IndexOf(':'); + if (colIdx >= 0 && Utilities.CountTagInText(text, beginTag) + Utilities.CountTagInText(text, endTag) == 4 && text.Length > colIdx + 1 && !char.IsDigit(text[colIdx + 1])) + { + var firstLine = text.Substring(0, index); + var secondLine = text.Substring(index).TrimStart(); + + var secIdxCol = secondLine.IndexOf(':'); + if (secIdxCol < 0 || !Utilities.IsBetweenNumbers(secondLine, secIdxCol)) + { + var idx = firstLine.IndexOf(':'); + if (idx > 1) + { + var pre = text.Substring(0, idx + 1).TrimStart(); + text = text.Remove(0, idx + 1); + text = FixInvalidItalicTags(text).Trim(); + if (text.StartsWith(" ", StringComparison.OrdinalIgnoreCase)) + text = Utilities.RemoveSpaceBeforeAfterTag(text, beginTag); + text = pre + " " + text; + } + } + } + } + + //- You think they're they gone? + //- That can't be. + if (italicBeginTagCount == 3 && italicEndTagCount == 1 && noOfLines == 2) + { + var newLineIdx = text.IndexOf(Environment.NewLine, StringComparison.Ordinal); + var firstLine = text.Substring(0, newLineIdx).Trim(); + var secondLine = text.Substring(newLineIdx).Trim(); + + if ((Utilities.StartsAndEndsWithTag(firstLine, beginTag, beginTag) && Utilities.StartsAndEndsWithTag(secondLine, beginTag, endTag)) || + (Utilities.StartsAndEndsWithTag(secondLine, beginTag, beginTag) && Utilities.StartsAndEndsWithTag(firstLine, beginTag, endTag))) + { + text = text.Replace(beginTag, string.Empty); + text = text.Replace(endTag, string.Empty); + text = text.Replace(" ", " ").Trim(); + text = beginTag + text + endTag; + } + } + + if (noOfLines == 3) + { + var lines = text.SplitToLines(); + if (italicBeginTagCount == 3 && italicEndTagCount == 2 || italicBeginTagCount == 2 && italicEndTagCount == 3) + { + int numberOfItalics = 0; + foreach (var line in lines) + { + if (line.StartsWith(beginTag, StringComparison.Ordinal)) + numberOfItalics++; + if (line.EndsWith(endTag, StringComparison.Ordinal)) + numberOfItalics++; + } + if 
(numberOfItalics == 5) + { // fix missing tag + text = "" + text.Replace("", string.Empty).Replace("", string.Empty) + ""; + } + } + } + + text = text.Replace("", string.Empty); + text = text.Replace(" ", string.Empty); + text = text.Replace(" ", string.Empty); + } + return text; + } + + public static string ToggleTag(string text, string tag) + { + if (text.IndexOf("<" + tag + ">", StringComparison.OrdinalIgnoreCase) >= 0 || + text.IndexOf("", StringComparison.OrdinalIgnoreCase) >= 0) + { + text = text.Replace("<" + tag + ">", string.Empty); + text = text.Replace("", string.Empty); + text = text.Replace("<" + tag.ToUpper() + ">", string.Empty); + text = text.Replace("", string.Empty); + } + else + { + int indexOfEndBracket = text.IndexOf('}'); + if (text.StartsWith("{\\", StringComparison.Ordinal) && indexOfEndBracket > 1 && indexOfEndBracket < 6) + { + text = $"{text.Substring(0, indexOfEndBracket + 1)}<{tag}>{text.Remove(0, indexOfEndBracket + 1)}"; + } + else + { + text = $"<{tag}>{text}"; + } + } + return text; + } + + } +} diff --git a/src/XmlContentTranslator/Translator/ITranslator.cs b/src/XmlContentTranslator/Translator/ITranslator.cs new file mode 100644 index 0000000..5456157 --- /dev/null +++ b/src/XmlContentTranslator/Translator/ITranslator.cs @@ -0,0 +1,13 @@ +using System.Collections.Generic; +using System.Text; + +namespace XmlContentTranslator.Translator +{ + public interface ITranslator + { + List GetTranslationPairs(); + string GetName(); + string GetUrl(); + List Translate(string sourceLanguage, string targetLanguage, List paragraphs, StringBuilder log); + } +} diff --git a/src/XmlContentTranslator/Translator/Json.cs b/src/XmlContentTranslator/Translator/Json.cs new file mode 100644 index 0000000..4459c60 --- /dev/null +++ b/src/XmlContentTranslator/Translator/Json.cs @@ -0,0 +1,54 @@ +using System; +using System.Text; + +namespace XmlContentTranslator.Translator +{ + public class Json + { + public static string EncodeJsonText(string text) + { + 
var sb = new StringBuilder(text.Length); + foreach (var c in text) + { + switch (c) + { + case '\\': + sb.Append("\\\\"); + break; + case '"': + sb.Append("\\\""); + break; + default: + sb.Append(c); + break; + } + } + return sb.ToString().Replace(Environment.NewLine, "
"); + } + + public static string DecodeJsonText(string text) + { + text = text.Replace("
", Environment.NewLine); + text = text.Replace("
", Environment.NewLine); + text = text.Replace("
", Environment.NewLine); + text = text.Replace("\\n", Environment.NewLine); + bool keepNext = false; + var sb = new StringBuilder(text.Length); + foreach (var c in text) + { + if (c == '\\' && !keepNext) + { + keepNext = true; + } + else + { + sb.Append(c); + keepNext = false; + } + } + return sb.ToString(); + } + + + } +} diff --git a/src/XmlContentTranslator/Translator/StringExtensions.cs b/src/XmlContentTranslator/Translator/StringExtensions.cs new file mode 100644 index 0000000..0a6eb84 --- /dev/null +++ b/src/XmlContentTranslator/Translator/StringExtensions.cs @@ -0,0 +1,274 @@ +using System; +using System.Globalization; +using System.Text; + +namespace XmlContentTranslator.Translator +{ + public static class StringExtensions + { + public static bool LineStartsWithHtmlTag(this string text, bool threeLengthTag, bool includeFont = false) + { + if (text == null || (!threeLengthTag && !includeFont)) + return false; + return StartsWithHtmlTag(text, threeLengthTag, includeFont); + } + + public static bool LineEndsWithHtmlTag(this string text, bool threeLengthTag, bool includeFont = false) + { + if (text == null) + return false; + + var len = text.Length; + if (len < 6 || text[len - 1] != '>') + return false; + + //
/// <summary>
/// Checks whether the text that follows the first line break starts with an HTML tag.
/// </summary>
/// <param name="text">The text to inspect; may be null.</param>
/// <param name="threeLengthTag">Forwarded to <c>StartsWithHtmlTag</c>: accept three-character tags.</param>
/// <param name="includeFont">Forwarded to <c>StartsWithHtmlTag</c>: accept font tags.</param>
/// <returns>
/// <c>false</c> when <paramref name="text"/> is null, when both flags are off, when there is no
/// <see cref="Environment.NewLine"/> in the text, or when fewer than three characters follow it;
/// otherwise the result of <c>StartsWithHtmlTag</c> for the remainder.
/// </returns>
public static bool LineBreakStartsWithHtmlTag(this string text, bool threeLengthTag, bool includeFont = false)
{
    if (text == null || (!threeLengthTag && !includeFont))
    {
        return false;
    }

    var newLineIdx = text.IndexOf(Environment.NewLine, StringComparison.Ordinal);
    if (newLineIdx < 0)
    {
        return false;
    }

    // Skip exactly the line break that was found. Its width is platform dependent
    // ("\r\n" or "\n"), so it must not be hard-coded as two characters.
    var secondLineStart = newLineIdx + Environment.NewLine.Length;

    // The shortest tag is three characters long.
    if (text.Length < secondLineStart + 3)
    {
        return false;
    }

    return StartsWithHtmlTag(text.Substring(secondLineStart), threeLengthTag, includeFont);
}
/// <summary>
/// Finds the first ordinal occurrence of <paramref name="pattern"/> in <paramref name="source"/>
/// by locating the first character with <see cref="string.IndexOf(char, int, int)"/> and then
/// comparing the remaining characters one by one.
/// </summary>
/// <remarks>Based on http://www.codeproject.com/Articles/43726/Optimizing-string-operations-in-C</remarks>
/// <param name="source">The string to search.</param>
/// <param name="pattern">The string to look for.</param>
/// <returns>The zero-based index of the first match, 0 for an empty pattern, or -1 when there is no match.</returns>
/// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="pattern"/> is null.</exception>
public static int FastIndexOf(this string source, string pattern)
{
    if (source == null)
    {
        throw new ArgumentNullException(nameof(source));
    }

    if (pattern == null)
    {
        throw new ArgumentNullException(nameof(pattern));
    }

    if (pattern.Length == 0)
    {
        return 0;
    }

    if (pattern.Length == 1)
    {
        return source.IndexOf(pattern[0]);
    }

    // Number of start positions at which the whole pattern still fits.
    int limit = source.Length - pattern.Length + 1;
    if (limit < 1)
    {
        return -1;
    }

    char c0 = pattern[0];
    char c1 = pattern[1];

    int first = source.IndexOf(c0, 0, limit);
    while (first != -1)
    {
        // Cheap rejection on the second character before comparing the rest.
        bool found = source[first + 1] == c1;
        for (var j = 2; found && j < pattern.Length; j++)
        {
            found = source[first + j] == pattern[j];
        }

        if (found)
        {
            return first;
        }

        // Resume the search for the first character one position further on.
        first++;
        first = source.IndexOf(c0, first, limit - first);
    }

    return -1;
}
/// <summary>
/// Returns a copy of <paramref name="s"/> without any character for which
/// <see cref="char.IsControl(char)"/> is true.
/// </summary>
/// <param name="s">The string to filter.</param>
/// <returns>The remaining characters, in their original order.</returns>
public static string RemoveControlCharacters(this string s)
{
    var kept = new StringBuilder(s.Length);
    foreach (var current in s)
    {
        if (char.IsControl(current))
        {
            continue;
        }

        kept.Append(current);
    }

    return kept.ToString();
}
/// <summary>
/// Escapes <paramref name="value"/> for embedding in RTF: backslashes and braces are
/// backslash-escaped, each <see cref="Environment.NewLine"/> is prefixed with <c>\par</c>,
/// and every character above 0x7F is written as a <c>\uN?</c> escape.
/// </summary>
/// <param name="value">The plain text to convert.</param>
/// <returns>The RTF fragment (without the surrounding document group).</returns>
public static string ToRtfPart(this string value)
{
    // Special RTF characters. The backslash must be handled first so that the
    // backslashes introduced by the following replacements are not doubled.
    var escaped = new StringBuilder(value);
    escaped.Replace(@"\", @"\\");
    escaped.Replace(@"{", @"\{");
    escaped.Replace(@"}", @"\}");
    escaped.Replace(Environment.NewLine, @"\par" + Environment.NewLine);

    var rtf = new StringBuilder(escaped.Length);
    for (var i = 0; i < escaped.Length; i++)
    {
        var character = escaped[i];
        if (character <= 0x7f)
        {
            rtf.Append(character);
            continue;
        }

        // \uN takes a signed 16-bit decimal, so code units from 0x8000 upwards have to be
        // written as negative numbers. Invariant formatting keeps the minus sign ASCII.
        var codeUnit = unchecked((short)character);
        rtf.Append("\\u").Append(codeUnit.ToString(CultureInfo.InvariantCulture)).Append('?');
    }

    return rtf.ToString();
}
namespace XmlContentTranslator.Translator
{
    /// <summary>
    /// A name/code pair; <see cref="ToString"/> yields the name.
    /// NOTE(review): presumably a language display name and its language code - confirm against the translators.
    /// </summary>
    public class TranslationPair
    {
        /// <summary>Creates a pair with both properties left at null.</summary>
        public TranslationPair()
        {
        }

        /// <summary>Creates a pair from the given name and code.</summary>
        /// <param name="name">Value for <see cref="Name"/>.</param>
        /// <param name="code">Value for <see cref="Code"/>.</param>
        public TranslationPair(string name, string code)
        {
            Code = code;
            Name = name;
        }

        /// <summary>The name; also the string representation of the pair.</summary>
        public string Name { get; set; }

        /// <summary>The code that belongs to <see cref="Name"/>.</summary>
        public string Code { get; set; }

        /// <summary>Returns <see cref="Name"/>.</summary>
        public override string ToString() => Name;
    }
}
/// <summary>
/// Determines whether <paramref name="s"/> contains the character <paramref name="c"/>.
/// </summary>
/// <param name="s">The string to search.</param>
/// <param name="c">The character to look for.</param>
/// <returns><c>true</c> when the character occurs anywhere in the string, including at index 0.</returns>
public static bool Contains(this string s, char c)
{
    // IndexOf returns -1 when there is no match, so index 0 is a valid hit.
    return s.IndexOf(c) >= 0;
}
/// <summary>
/// Counts the lines in <paramref name="text"/>: one plus the number of '\n' characters.
/// </summary>
/// <param name="text">The text to inspect; may be null.</param>
/// <returns>0 for null or empty text, otherwise the line count.</returns>
public static int GetNumberOfLines(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        return 0;
    }

    var lineCount = 1;
    foreach (var ch in text)
    {
        if (ch == '\n')
        {
            lineCount++;
        }
    }

    return lineCount;
}
/// <summary>
/// Tells whether the character at <paramref name="position"/> has a digit directly before
/// and directly after it.
/// </summary>
/// <param name="s">The text to inspect; may be null.</param>
/// <param name="position">Index of the character whose neighbours are checked.</param>
/// <returns>
/// <c>false</c> for null or empty text or when either neighbour would fall outside the string;
/// otherwise whether both neighbours are digits.
/// </returns>
public static bool IsBetweenNumbers(string s, int position)
{
    if (string.IsNullOrEmpty(s))
    {
        return false;
    }

    // Both position - 1 and position + 1 have to be valid indexes.
    var hasBothNeighbours = position >= 1 && position + 2 <= s.Length;
    if (!hasBothNeighbours)
    {
        return false;
    }

    var before = s[position - 1];
    var after = s[position + 1];
    return char.IsDigit(before) && char.IsDigit(after);
}
/// <summary>
/// Decides whether a line break may be placed at <paramref name="index"/> in <paramref name="s"/>.
/// </summary>
/// <param name="s">The text being broken.</param>
/// <param name="index">Position of the candidate break character.</param>
/// <param name="language">Language code; currently not used by this method.</param>
/// <returns>
/// <c>true</c> when <paramref name="index"/> is inside the string, the character there is a
/// carriage return, line feed, tab or space, and the text before it does not end with
/// "? -", "! -" or ". -"; otherwise <c>false</c>.
/// </returns>
private static bool CanBreak(string s, int index, string language)
{
    if (index < 0 || index >= s.Length)
    {
        return false;
    }

    // IndexOf(...) >= 0 so that '\r', the first character of the set, is accepted as well.
    const string breakCharacters = "\r\n\t ";
    if (breakCharacters.IndexOf(s[index]) < 0)
    {
        return false;
    }

    // Some endings we don't like breaking after: a sentence end followed by a dash.
    var before = s.Substring(0, index);
    if (before.EndsWith("? -", StringComparison.Ordinal) ||
        before.EndsWith("! -", StringComparison.Ordinal) ||
        before.EndsWith(". -", StringComparison.Ordinal))
    {
        return false;
    }

    return true;
}
", "" + Environment.NewLine); + s = s.Replace(Environment.NewLine + "", "" + Environment.NewLine); + s = s.Replace(Environment.NewLine + "", "" + Environment.NewLine); + s = s.Replace(Environment.NewLine + "", "
" + Environment.NewLine); + return s.TrimEnd(); + } + } + } + + return text; + } + + private static List SplitToX(string[] words, int count, int average) + { + var list = new List(); + int currentIdx = 0; + int currentCount = 0; + foreach (string word in words) + { + if (currentCount + word.Length + 3 > average && currentIdx < count) + { + list.Add(currentCount); + currentIdx++; + currentCount = 0; + } + currentCount += word.Length + 1; + } + if (currentIdx < count) + list.Add(currentCount); + else + list[list.Count - 1] += currentCount; + return list; + } + + public static string AutoBreakLine(string text, int maximumLength, int mergeLinesShorterThan, string language) + { + if (text == null || text.Length < 3) + return text; + + // do not autobreak dialogs + if (Contains(text, '-') && text.Contains(Environment.NewLine)) + { + var noTagLines = HtmlUtil.RemoveHtmlTags(text, true).SplitToLines(); + if (noTagLines.Length == 2) + { + var arr0 = noTagLines[0].Trim().TrimEnd('"', '\'').TrimEnd(); + if (arr0.StartsWith('-') && noTagLines[1].TrimStart().StartsWith('-') && arr0.Length > 1 && (Contains(".?!)]", arr0[arr0.Length - 1]) || arr0.EndsWith("--", StringComparison.Ordinal) || arr0.EndsWith('–'))) + return text; + } + } + + string s = RemoveLineBreaks(text); + if (HtmlUtil.RemoveHtmlTags(s, true).Length < mergeLinesShorterThan) + { + return s; + } + + var htmlTags = new Dictionary(); + var sb = new StringBuilder(); + int six = 0; + while (six < s.Length) + { + var letter = s[six]; + bool tagFound = false; + if (letter == '<') + { + string tagString = s.Substring(six); + tagFound = tagString.StartsWith("', six + 1); + + if (tagFound && endIndex > 0) + { + string tag = s.Substring(six, endIndex - six + 1); + s = s.Remove(six, tag.Length); + if (htmlTags.ContainsKey(six)) + htmlTags[six] = htmlTags[six] + tag; + else + htmlTags.Add(six, tag); + } + else + { + sb.Append(letter); + six++; + } + } + s = sb.ToString(); + + int splitPos = -1; + int mid = s.Length / 2; + + // 
try to find " - " with uppercase letter after (dialog) + if (s.Contains(" - ")) + { + for (int j = 0; j <= (maximumLength / 2) + 5; j++) + { + if (mid + j + 4 < s.Length) + { + if (s[mid + j] == '-' && s[mid + j + 1] == ' ' && s[mid + j - 1] == ' ') + { + string rest = s.Substring(mid + j + 1).TrimStart(); + if (rest.Length > 0 && char.IsUpper(rest[0])) + { + splitPos = mid + j; + break; + } + } + } + if (mid - (j + 1) > 4) + { + if (s[mid - j] == '-' && s[mid - j + 1] == ' ' && s[mid - j - 1] == ' ') + { + string rest = s.Substring(mid - j + 1).TrimStart(); + if (rest.Length > 0 && char.IsUpper(rest[0])) + { + if (mid - j > 5 && s[mid - j - 1] == ' ') + { + if (Contains("!?.", s[mid - j - 2])) + { + splitPos = mid - j; + break; + } + var first = s.Substring(0, mid - j - 1); + if (first.EndsWith(".\"", StringComparison.Ordinal) || first.EndsWith("!\"", StringComparison.Ordinal) || first.EndsWith("?\"", StringComparison.Ordinal)) + { + splitPos = mid - j; + break; + } + } + } + } + } + } + } + + if (splitPos == maximumLength + 1 && s[maximumLength] != ' ') // only allow space for last char (as it does not count) + splitPos = -1; + + if (splitPos < 0) + { + const string expectedChars1 = ".!?0123456789"; + const string expectedChars2 = ".!?"; + for (int j = 0; j < 15; j++) + { + if (mid + j + 1 < s.Length && mid + j > 0) + { + if (Contains(expectedChars2, s[mid + j]) && !IsPartOfNumber(s, mid + j) && CanBreak(s, mid + j + 1, language)) + { + splitPos = mid + j + 1; + if (Contains(expectedChars1, s[splitPos])) + { // do not break double/tripple end lines like "!!!" or "..." 
+ splitPos++; + if (Contains(expectedChars1, s[mid + j + 1])) + splitPos++; + } + break; + } + if (Contains(expectedChars2, s[mid - j]) && !IsPartOfNumber(s, mid - j) && CanBreak(s, mid - j, language)) + { + splitPos = mid - j; + splitPos++; + break; + } + } + } + } + + if (splitPos > maximumLength) // too long first line + { + if (splitPos != maximumLength + 1 || s[maximumLength] != ' ') // allow for maxlength+1 char to be space (does not count) + splitPos = -1; + } + else if (splitPos >= 0 && s.Length - splitPos > maximumLength) // too long second line + { + splitPos = -1; + } + + if (splitPos < 0) + { + const string expectedChars1 = ".!?, "; + const string expectedChars2 = " .!?"; + const string expectedChars3 = ".!?"; + for (int j = 0; j < 25; j++) + { + if (mid + j + 1 < s.Length && mid + j > 0) + { + if (Contains(expectedChars1, s[mid + j]) && !IsPartOfNumber(s, mid + j) && s.Length > mid + j + 2 && CanBreak(s, mid + j, language)) + { + splitPos = mid + j; + if (Contains(expectedChars2, s[mid + j + 1])) + { + splitPos++; + if (Contains(expectedChars2, s[mid + j + 2])) + splitPos++; + } + break; + } + if (Contains(expectedChars1, s[mid - j]) && !IsPartOfNumber(s, mid - j) && s.Length > mid + j + 2 && CanBreak(s, mid - j, language)) + { + splitPos = mid - j; + if (Contains(expectedChars3, s[splitPos])) + splitPos--; + if (Contains(expectedChars3, s[splitPos])) + splitPos--; + if (Contains(expectedChars3, s[splitPos])) + splitPos--; + break; + } + } + } + } + + if (splitPos < 0) + { + splitPos = mid; + s = s.Insert(mid - 1, Environment.NewLine); + s = ReInsertHtmlTags(s, htmlTags); + htmlTags = new Dictionary(); + s = s.Replace(Environment.NewLine, "-"); + } + if (splitPos < s.Length - 2) + s = s.Substring(0, splitPos) + Environment.NewLine + s.Substring(splitPos); + + s = ReInsertHtmlTags(s, htmlTags); + var idx = s.IndexOf(Environment.NewLine + " 2) + { + var endIdx = s.IndexOf('>', idx + 2); + if (endIdx > idx) + { + var tag = s.Substring(idx + 
Environment.NewLine.Length, endIdx - (idx + Environment.NewLine.Length) + 1); + s = s.Insert(idx, tag); + s = s.Remove(idx + tag.Length + Environment.NewLine.Length, tag.Length); + } + } + s = s.Replace(" " + Environment.NewLine, Environment.NewLine); + s = s.Replace(Environment.NewLine + " ", Environment.NewLine); + return s.TrimEnd(); + } + + public static string RemoveLineBreaks(string s) + { + s = HtmlUtil.FixUpperTags(s); + s = s.Replace(Environment.NewLine + "", "" + Environment.NewLine); + s = s.Replace(Environment.NewLine + "", "" + Environment.NewLine); + s = s.Replace(Environment.NewLine + "", "" + Environment.NewLine); + s = s.Replace(Environment.NewLine + "
", "" + Environment.NewLine); + s = s.Replace(" " + Environment.NewLine + "", " "); + s = s.Replace("" + Environment.NewLine + " ", " "); + s = s.Replace("" + Environment.NewLine + "", " "); + s = s.Replace(Environment.NewLine, " "); + s = s.Replace(" ", " "); + s = s.Replace(" ", " "); + s = s.Replace(" ", " "); + s = s.Replace(" ", " "); + s = FixExtraSpaces(s); + return s.Trim(); + } + + private static string ReInsertHtmlTags(string s, Dictionary htmlTags) + { + if (htmlTags.Count > 0) + { + var sb = new StringBuilder(s.Length); + int six = 0; + foreach (var letter in s) + { + if (Contains(Environment.NewLine, letter)) + { + sb.Append(letter); + } + else + { + if (htmlTags.ContainsKey(six)) + { + sb.Append(htmlTags[six]); + } + sb.Append(letter); + six++; + } + } + if (htmlTags.ContainsKey(six)) + { + sb.Append(htmlTags[six]); + } + return sb.ToString(); + } + return s; + } + + private static bool IsPartOfNumber(string s, int position) + { + if (string.IsNullOrWhiteSpace(s) || position + 1 >= s.Length) + return false; + + if (position > 0 && Contains(@",.", s[position])) + { + return char.IsDigit(s[position - 1]) && char.IsDigit(s[position + 1]); + } + return false; + } + + } +} \ No newline at end of file diff --git a/src/XmlContentTranslator/XmlContentTranslator.csproj b/src/XmlContentTranslator/XmlContentTranslator.csproj index f4d0259..e1cc092 100644 --- a/src/XmlContentTranslator/XmlContentTranslator.csproj +++ b/src/XmlContentTranslator/XmlContentTranslator.csproj @@ -61,6 +61,16 @@ + + + + + + + + + + diff --git a/src/XmlContentTranslator/XmlContentTranslator.csproj.user b/src/XmlContentTranslator/XmlContentTranslator.csproj.user new file mode 100644 index 0000000..6cbe588 --- /dev/null +++ b/src/XmlContentTranslator/XmlContentTranslator.csproj.user @@ -0,0 +1,6 @@ + + + + ProjectFiles + + \ No newline at end of file