Support for languages using Diacritics as multi-codepoint joiners (#11806)

* Diacritics support - achieving 1-Char-Per-Glyph via a fake alphabet * Diacritics support - Redesign state engine and polishing * Diacritics support - Unit test * Diacritics support - Expand unit test to cover more cases * Diacritics support - Expand unit test to cover more cases * Clarify a function name * Change format of diacritic definitions * Refactor DiacriticSupport to per-language class with statics in Companion * Update DiacriticSupport to use CharCategory and enable support of surrogate pairs * Documentation
2025-07-04 15:27:50 +07:00 · 2024-06-29 22:39:46 +02:00
parent 0f2a697ba6
commit b5622df92d
10 changed files with 432 additions and 32 deletions
--- a/android/assets/jsons/translations/template.properties
+++ b/android/assets/jsons/translations/template.properties
@ -9,6 +9,15 @@
 # Don't translate these words to your language, only put 'true' or 'false'.
 StartWithCapitalLetter = 

+# Diacritics Support:
+# See https://yairm210.github.io/Unciv/Other/Translating/#diacritics-support for details!
+# Most languages will not need these, feel free to ignore, or use "" to avoid the "requires translation" mark. Do NOT translate the "key" to the left of the equals sign!
+diacritics_support = 
+unicode_block_start_character = 
+unicode_block_end_character = 
+left_joining_diacritics = 
+right_joining_diacritics = 
+left_and_right_joiners = 

 # Fastlane
 # These will be automatically copied to the fastlane descriptions used by F-Droid. Their keys are not as usual the english original, please read those directly as linked.
--- a/android/src/com/unciv/app/AndroidFont.kt
+++ b/android/src/com/unciv/app/AndroidFont.kt
@ -90,9 +90,9 @@ class AndroidFont : FontImplementation {
        return paint.textSize.toInt()
    }

-    override fun getCharPixmap(char: Char): Pixmap {
+    override fun getCharPixmap(symbolString: String): Pixmap {
        val metric = getMetrics()  // Use our interpretation instead of paint.fontMetrics because it fixes some bad metrics
-        var width = paint.measureText(char.toString()).toInt()
+        var width = paint.measureText(symbolString).toInt()
        var height = ceil(metric.height).toInt()
        if (width == 0) {
            height = getFontSize()
@ -101,7 +101,7 @@ class AndroidFont : FontImplementation {

        val bitmap = Bitmap.createBitmap(width, height, Bitmap.Config.ARGB_8888)
        val canvas = Canvas(bitmap)
-        canvas.drawText(char.toString(), 0f, metric.leading + metric.ascent + 1f, paint)
+        canvas.drawText(symbolString, 0f, metric.leading + metric.ascent + 1f, paint)

        val pixmap = Pixmap(width, height, Pixmap.Format.RGBA8888)
        val data = IntArray(width * height)
--- a/core/src/com/unciv/models/translations/Translations.kt
+++ b/core/src/com/unciv/models/translations/Translations.kt
@ -7,10 +7,12 @@ import com.unciv.models.ruleset.RulesetCache
 import com.unciv.models.ruleset.unique.Unique
 import com.unciv.models.stats.Stat
 import com.unciv.models.stats.Stats
+import com.unciv.ui.components.fonts.DiacriticSupport
 import com.unciv.ui.components.fonts.FontRulesetIcons
 import com.unciv.utils.Log
 import com.unciv.utils.debug
 import java.util.Locale
+import org.jetbrains.annotations.VisibleForTesting

 /**
 *  This collection holds all translations for the game.
@ -41,7 +43,6 @@ class Translations : LinkedHashMap<String, TranslationEntry>() {
    // used by tr() whenever GameInfo not initialized (allowing new game screen to use mod translations)
    var translationActiveMods = LinkedHashSet<String>()

-
    /**
     * Searches for the translation entry of a given [text] for a given [language].
     * This includes translations provided by mods from [activeMods]
@ -118,22 +119,27 @@ class Translations : LinkedHashMap<String, TranslationEntry>() {
        debug("Loading translation file for %s - %sms", language, System.currentTimeMillis() - translationStart)
    }

-    private fun createTranslations(language: String, languageTranslations: HashMap<String,String>) {
-        for (translation in languageTranslations) {
-            val hashKey = if (translation.key.contains('[') && !translation.key.contains('<'))
-                translation.key.getPlaceholderText()
-            else translation.key
+    @VisibleForTesting
+    fun createTranslations(language: String, languageTranslations: HashMap<String, String>) {
+        val diacriticSupport = DiacriticSupport(languageTranslations).takeIf { it.isEnabled() }
+        for ((key, value) in languageTranslations) {
+            val hashKey = if (key.contains('[') && !key.contains('<'))
+                key.getPlaceholderText()
+            else key
            var entry = this[hashKey]
            if (entry == null) {
-                entry = TranslationEntry(translation.key)
+                entry = TranslationEntry(key)
                this[hashKey] = entry
            }
-            entry[language] = translation.value
+            entry[language] = diacriticSupport?.remapDiacritics(value) ?: value
        }
    }

+
    fun tryReadTranslationForCurrentLanguage() {
+        DiacriticSupport.reset()
        tryReadTranslationForLanguage(UncivGame.Current.settings.language)
+        DiacriticSupport.freeTranslationData()
    }

    /** Get a list of supported languages for [readAllLanguagesTranslation] */
@ -174,9 +180,11 @@ class Translations : LinkedHashMap<String, TranslationEntry>() {

        val translationStart = System.currentTimeMillis()

+        DiacriticSupport.reset()
        for (language in getLanguagesWithTranslationFile()) {
            tryReadTranslationForLanguage(language)
        }
+        DiacriticSupport.freeTranslationData()

        debug("Loading translation files - %sms", System.currentTimeMillis() - translationStart)
    }
--- a/core/src/com/unciv/ui/components/fonts/DiacriticSupport.kt
+++ b/core/src/com/unciv/ui/components/fonts/DiacriticSupport.kt
@ -0,0 +1,251 @@
+package com.unciv.ui.components.fonts
+
+import com.unciv.utils.Log
+import org.jetbrains.annotations.VisibleForTesting
+
+/**
+ *  ## An engine to support languages with heavy diacritic usage through Gdx Scene2D
+ *
+ *  ### Concepts
+ *  - This is not needed for diacritics where Unicode already defines the combined glyphs as individual codepoints
+ *  - Gdx text rendering assumes one Char one Glyph (and left-to-right)
+ *  - The underlying OS **does** have the capability to render glyphs created by combining diacritic joiners with other characters (if not, this fails with ugly output but hopefully no exceptions).
+ *  - We'll deal with one glyph at a time, arranged left to right, and expect a finite number of combination glyphs (all fit into the Unicode Private Use Area **together** with [FontRulesetIcons]).
+ *  - We'll recognize these combos in the translated texts at translation loading time and map each combo into a fake alphabet, which fulfills the "one Char one Glyph" tenet.
+ *  - Conversely, the loader will build a map of distinct combinations -codepoint sequences- that map into a single glyph and correlate each with their fake alphabet codepoint.
+ *  - At render time, only the map of fake alphabet codepoints to their original codepoint sequences is needed.
+ *        * Remember, NativeBitmapFontData renders and caches glyphs on demand
+ *        * GlyphLayout (a Gdx class) needs a Glyph that's not yet cached, then:
+ *        * If it's in the fake alphabet, we'll ask the OS to render the original codepoint sequence instead
+ *        * Otherwise render the single Char as before
+ *
+ *  ### Usage
+ *  - Call [reset] when translation loading starts over
+ *  - Instantiate [DiacriticSupport] through the constructor-like factory [invoke] once a translation file is read (its map of key (left of =) to translation (right of =) is then in memory; pass that as the argument)
+ *  - Check [isEnabled] - if false, the rest of that language load need not bother with diacritics
+ *  - Call [remapDiacritics] on each translation and store the result instead of the original value
+ *  - If you wish to save some memory, call [freeTranslationData] after all required languages are done
+ *  - Later, [NativeBitmapFontData.createAndCacheGlyph] will use [getStringFor] to map the fake alphabet back to codepoint sequences
+ *
+ *  ### Notes
+ *  - [FontRulesetIcons] initialize ***after*** Translation loading. If this ever changes, this might need some tweaking.
+ *  - The primary constructor is only used from the [Companion.invoke] factory and for testing.
+ */
+class DiacriticSupport(
+    private val enabled: Boolean = false,
+    range: CharRange,
+    leftDiacritics: String,
+    rightDiacritics: String,
+    joinerDiacritics: String
+) {
+    private object TranslationKeys {
+        const val enable = "diacritics_support"
+        const val rangeStart = "unicode_block_start_character"
+        const val rangeEnd = "unicode_block_end_character"
+        const val left = "left_joining_diacritics"
+        const val right = "right_joining_diacritics"
+        const val joiner = "left_and_right_joiners"
+    }
+
+    companion object {
+        /** Start at end of Unicode Private Use Area and go down from there: UShort is the preferred Char() constructor parameter! */
+        private const val startingReplacementCodepoint: UShort = 63743u // 0xF8FF
+        /** Use stdlib CharCategory to determine which codepoints represent combining diacritical marks.
+         *  We're defaulting all punctuation, currency & other symbols, and nonprinting to None,
+         *  meaning they won't combine even when followed by a diacritic. */
+        private val charCategoryToClass = mapOf(
+            CharCategory.UPPERCASE_LETTER to CharClass.Base,
+            CharCategory.LOWERCASE_LETTER to CharClass.Base,
+            CharCategory.TITLECASE_LETTER to CharClass.Base,
+            CharCategory.OTHER_LETTER to CharClass.Base,
+            CharCategory.MODIFIER_LETTER to CharClass.Base,
+            CharCategory.DECIMAL_DIGIT_NUMBER to CharClass.Base,
+            CharCategory.LETTER_NUMBER to CharClass.Base,
+            CharCategory.OTHER_NUMBER to CharClass.Base,
+            CharCategory.COMBINING_SPACING_MARK to CharClass.LeftJoiner,
+            CharCategory.NON_SPACING_MARK to CharClass.LeftJoiner,
+            CharCategory.ENCLOSING_MARK to CharClass.LeftJoiner,
+            CharCategory.SURROGATE to CharClass.Surrogate
+        )
+        private const val defaultRangeStart = '\u0021'
+        private const val defaultRangeEnd = '\uFFEE'
+
+        private var nextFreeDiacriticReplacementCodepoint = startingReplacementCodepoint
+        private val fakeAlphabet = mutableMapOf<Char, String>()
+        private val inverseMap = mutableMapOf<String, Char>()
+
+        /** Prepares this for a complete start-over, expecting a language load to instantiate a DiacriticSupport next */
+        fun reset() {
+            fakeAlphabet.clear()
+            freeTranslationData()
+            nextFreeDiacriticReplacementCodepoint = startingReplacementCodepoint
+        }
+
+        /** This is the main engine for rendering text glyphs after the translation loader has filled up this `object`
+         *  @param  char The real or "fake alphabet" char stored by [remapDiacritics] to render
+         *  @return The one to many (probably 8 max) codepoint string to be rendered into a single glyph by native font services
+         */
+        fun getStringFor(char: Char) = fakeAlphabet[char] ?: char.toString()
+
+        /** Call when use of [remapDiacritics] is finished to save some memory */
+        fun freeTranslationData() {
+            for ((length, examples) in inverseMap.keys.groupBy { it.length }.toSortedMap()) {
+                Log.debug("Length %d - example %s", length, examples.first())
+            }
+            inverseMap.clear()
+        }
+
+        /** Other "fake" alphabets can use the Unicode Private Use Area from U+E000 up to and including the code returned here (the next one not yet taken by a diacritic combination) */
+        fun getCurrentFreeCode() = Char(nextFreeDiacriticReplacementCodepoint)
+
+        /** If this is true, no need to bother [remapping chars at render time][getStringFor] */
+        fun isEmpty() = fakeAlphabet.isEmpty()
+
+        /** Factory that gets the primary constructor parameters by extracting the translation entries for [TranslationKeys] */
+        operator fun invoke(translations: HashMap<String, String>): DiacriticSupport {
+            val stripCommentRegex = """^"?(.*?)"?(?:\s*#.*)?$""".toRegex()
+            fun String?.parseDiacriticEntry(): String {
+                if (isNullOrEmpty()) return ""
+                val tokens = stripCommentRegex.matchEntire(this)!!.groupValues[1].splitToSequence(' ').toMutableList()
+                for (index in tokens.indices) {
+                    val token = tokens[index]
+                    when {
+                        token.length == 1 -> continue
+                        token.startsWith("u+", true) -> tokens[index] = Char(token.drop(2).toInt(16)).toString()
+                        tokens.size == 1 -> continue
+                        else -> throw IllegalArgumentException("Invalid diacritic definition: \"$token\" is not a single character or unicode codepoint notation")
+                    }
+                }
+                return tokens.joinToString("")
+            }
+
+            val enable = translations[TranslationKeys.enable].parseDiacriticEntry() == "true"
+            val rangeStart = translations[TranslationKeys.rangeStart].parseDiacriticEntry()
+            val rangeEnd = translations[TranslationKeys.rangeEnd].parseDiacriticEntry()
+            val range = if (rangeStart.isEmpty() || rangeEnd.isEmpty()) CharRange.EMPTY
+                else rangeStart.first()..rangeEnd.first()
+            val leftDiacritics = translations[TranslationKeys.left].parseDiacriticEntry()
+            val rightDiacritics = translations[TranslationKeys.right].parseDiacriticEntry()
+            val joinerDiacritics = translations[TranslationKeys.joiner].parseDiacriticEntry()
+
+            return DiacriticSupport(enable, range, leftDiacritics, rightDiacritics, joinerDiacritics)
+        }
+    }
+
+    private val charClassMap = mutableMapOf<Char, CharClass>()
+
+    /** Holds all information to process a single translation line and replace diacritic combinations with fake alphabet codepoints */
+    private inner class LineData(capacity: Int) {
+        val output = StringBuilder(capacity)
+        val accumulator = StringBuilder(9) // touhidurrr said there can be nine
+        var waitingHighSurrogate = Char.MIN_VALUE
+
+        fun expectsJoin() = accumulator.isNotEmpty() && getCharClass(accumulator.last()).expectsRightJoin
+        fun flush() {
+            if (accumulator.length <= 1) output.append(accumulator)
+            else output.append(getReplacementChar(accumulator.toString()))
+            accumulator.clear()
+        }
+        fun forbidWaitingHighSurrogate() {
+            if (waitingHighSurrogate != Char.MIN_VALUE)
+                throw IllegalArgumentException("Invalid Unicode: High surrogate without low surrogate")
+        }
+        fun accumulate(char: Char) {
+            forbidWaitingHighSurrogate()
+            accumulator.append(char)
+        }
+        fun flushAccumulate(char: Char) {
+            forbidWaitingHighSurrogate()
+            if (!expectsJoin()) flush()
+            accumulator.append(char)
+        }
+        fun flushAppend(char: Char) {
+            forbidWaitingHighSurrogate()
+            flush()
+            output.append(char)
+        }
+        fun surrogate(char: Char) {
+            if (char.isHighSurrogate()) {
+                forbidWaitingHighSurrogate()
+                waitingHighSurrogate = char
+            } else {
+                if (waitingHighSurrogate == Char.MIN_VALUE) throw IllegalArgumentException("Invalid Unicode: Low surrogate without high surrogate")
+                if (!expectsJoin()) flush()
+                accumulator.append(waitingHighSurrogate)
+                accumulator.append(char)
+                waitingHighSurrogate = Char.MIN_VALUE
+            }
+        }
+        fun result(): String {
+            flush()
+            return output.toString()
+        }
+    }
+
+    /** Represents a class of input character and its processing method when processing a translation line */
+    private enum class CharClass(val expectsRightJoin: Boolean = false) {
+        None {
+            override fun process(data: LineData, char: Char) = data.flushAppend(char)
+        },
+        Base {
+            override fun process(data: LineData, char: Char) = data.flushAccumulate(char)
+        },
+        LeftJoiner {
+            override fun process(data: LineData, char: Char) = data.accumulate(char)
+        },
+        RightJoiner(true) {
+            override fun process(data: LineData, char: Char) = data.flushAccumulate(char)
+        },
+        LeftRightJoiner(true) {
+            override fun process(data: LineData, char: Char) = data.accumulate(char)
+        },
+        Surrogate {
+            override fun process(data: LineData, char: Char) = data.surrogate(char)
+        };
+        abstract fun process(data: LineData, char: Char)
+    }
+
+    @VisibleForTesting
+    fun getKnownCombinations(): Set<String> = inverseMap.keys
+
+    /** Set at instantiation. If false, the translation loader need not (and must not) pass anything through [remapDiacritics], which throws when not enabled. */
+    fun isEnabled() = enabled
+
+    private fun getCharClass(char: Char) = charClassMap[char] ?: CharClass.None
+
+    private fun getReplacementChar(joined: String) = inverseMap[joined] ?: createReplacementChar(joined)
+
+    private fun createReplacementChar(joined: String): Char {
+        val char = getCurrentFreeCode()
+        nextFreeDiacriticReplacementCodepoint--
+        if (nextFreeDiacriticReplacementCodepoint < FontRulesetIcons.UNUSED_CHARACTER_CODES_START.toUInt())
+            throw IllegalStateException("DiacriticsSupport has exhausted the Unicode private use area")
+        fakeAlphabet[char] = joined
+        inverseMap[joined] = char
+        return char
+    }
+
+    init {
+        if (enabled) {
+            val rangeStart = if (range.isEmpty()) defaultRangeStart else range.first
+            val rangeEnd = if (range.isEmpty()) defaultRangeEnd else range.last
+            for (char in rangeStart..rangeEnd)
+                charClassMap[char] = charCategoryToClass[char.category] ?: continue
+            for (char in leftDiacritics) charClassMap[char] = CharClass.LeftJoiner
+            for (char in rightDiacritics) charClassMap[char] = CharClass.RightJoiner
+            for (char in joinerDiacritics) charClassMap[char] = CharClass.LeftRightJoiner
+        }
+    }
+
+    /** Replaces the combos of diacritics/joiners with their affected characters with a "fake" alphabet */
+    fun remapDiacritics(value: String): String {
+        if (!enabled)
+            throw IllegalStateException("DiacriticSupport not set up properly for translation processing")
+
+        val data = LineData(value.length)
+        for (char in value) {
+            getCharClass(char).process(data, char)
+        }
+        return data.result()
+    }
+}
--- a/core/src/com/unciv/ui/components/fonts/FontImplementation.kt
+++ b/core/src/com/unciv/ui/components/fonts/FontImplementation.kt
@ -6,7 +6,18 @@ import com.badlogic.gdx.graphics.g2d.BitmapFont
 interface FontImplementation {
    fun setFontFamily(fontFamilyData: FontFamilyData, size: Int)
    fun getFontSize(): Int
-    fun getCharPixmap(char: Char): Pixmap
+
+    /** Why are we having two [getCharPixmap] overloads:
+     *  - The Char one was used alone for a long time. We added the String one for Diacritic support - it still is meant to give one Glyph per input,
+     *    but supports both single characters and short combos of diacritics with their target characters.
+     *  - The desktop implementation currently uses (java.awt) metric.charWidth for the Char overload and metric.stringWidth for the String overload.
+     *  - If there were a guarantee that these were always identical for a char and its toString(), then the Char overload would be redundant.
+     *  - The author just wanted to make 100% sure **nothing** changes for non-Diacritic languages.
+     *  - This could be tested with FasterUIDevelopment, as there the special Char overload is ignored.
+     * */
+    fun getCharPixmap(char: Char) = getCharPixmap(char.toString())
+    fun getCharPixmap(symbolString: String): Pixmap
+
    fun getSystemFonts(): Sequence<FontFamilyData>

    fun getBitmapFont(): BitmapFont {
--- a/core/src/com/unciv/ui/components/fonts/NativeBitmapFontData.kt
+++ b/core/src/com/unciv/ui/components/fonts/NativeBitmapFontData.kt
@ -81,7 +81,8 @@ class NativeBitmapFontData(

        // Check alpha to guess whether this is a round icon
        // Needs to be done before disposing charPixmap, and we want to do that soon
-        val assumeRoundIcon = charPixmap.guessIsRoundSurroundedByTransparency()
+        val isFontRulesetIcon = ch.code >= FontRulesetIcons.UNUSED_CHARACTER_CODES_START && ch <= DiacriticSupport.getCurrentFreeCode()
+        val assumeRoundIcon = isFontRulesetIcon && charPixmap.guessIsRoundSurroundedByTransparency()

        val rect = packer.pack(charPixmap)
        charPixmap.dispose()
@ -89,7 +90,7 @@ class NativeBitmapFontData(
        glyph.srcX = rect.x.toInt()
        glyph.srcY = rect.y.toInt()

-        if (ch.code >= FontRulesetIcons.UNUSED_CHARACTER_CODES_START)
+        if (isFontRulesetIcon)
            glyph.setRulesetIconGeometry(assumeRoundIcon)

        // If a page was added, create a new texture region for the incrementally added glyph.
@ -148,7 +149,9 @@ class NativeBitmapFontData(
                } catch (_: Exception) {
                    Pixmap(0, 0, Pixmap.Format.RGBA8888) // Empty space
                }
-        return fontImplementation.getCharPixmap(ch)
+        if (DiacriticSupport.isEmpty())
+            return fontImplementation.getCharPixmap(ch)
+        return fontImplementation.getCharPixmap(DiacriticSupport.getStringFor(ch))
    }

    override fun getGlyphs(run: GlyphLayout.GlyphRun, str: CharSequence, start: Int, end: Int, lastGlyph: BitmapFont.Glyph?) {
--- a/desktop/src/com/unciv/app/desktop/DesktopFont.kt
+++ b/desktop/src/com/unciv/app/desktop/DesktopFont.kt
@ -61,8 +61,12 @@ class DesktopFont : FontImplementation {
        return font.size
    }

-    override fun getCharPixmap(char: Char): Pixmap {
-        var width = metric.charWidth(char)
+    override fun getCharPixmap(char: Char) = getCharPixmapCommon(char.toString(), metric.charWidth(char))
+
+    override fun getCharPixmap(symbolString: String) = getCharPixmapCommon(symbolString, metric.stringWidth(symbolString))
+
+    private fun getCharPixmapCommon(symbolString: String, measuredWidth: Int): Pixmap {
+        var width = measuredWidth
        var height = metric.height
        if (width == 0) {
            // This happens e.g. for the Tab character
@ -75,7 +79,7 @@ class DesktopFont : FontImplementation {
        g.setRenderingHint(RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON)
        g.font = font
        g.color = Color.WHITE
-        g.drawString(char.toString(), 0, metric.leading + metric.ascent)
+        g.drawString(symbolString, 0, metric.leading + metric.ascent)

        val pixmap = Pixmap(bi.width, bi.height, Pixmap.Format.RGBA8888)
        val data = bi.getRGB(0, 0, bi.width, bi.height, null, 0, bi.width)
--- a/docs/Other/Translating.md
+++ b/docs/Other/Translating.md
@ -59,6 +59,46 @@ If any of the following steps are beyond your skillset, ask for help. All but th
 - The first function of this entry is alphabetical sorting. Unfortunately, it is not easy to tell whether a specific combination is supported by Java. The simplest way to deal with this is trial and error - once your language is established and playable, see if Civilopedia entries seem properly sorted, if not, open an issue and tell us what _other_, more common language may have better sorting rules.
 - This entry is also required to enable fastlane description upload to a correct folder - however, whether F-Droid supports your language is not guaranteed ([this page](https://f-droid.org/docs/Translation_and_Localization/) should help - but doesn't).

+## Diacritics support
+
+When displaying text, the underlying libraries (libGdx and possibly lwjgl3/GWT) that Unciv uses assume one code unit in the [UTF-16](https://en.wikipedia.org/wiki/UTF-16) representation corresponds to one rendered glyph,
+which causes incorrect display of languages making heavy use of diacritics or of characters outside the [basic multilingual plane](https://en.wikipedia.org/wiki/Plane_(Unicode)#Basic_Multilingual_Plane) like most emoji.
+A language file can activate a "trick", where combinations of codepoints that should render as one single glyph are mapped into a "fake alphabet",
+which is created on the fly in the [Private Use Area](https://en.wikipedia.org/wiki/Private_Use_Areas) defined by Unicode.
+
+To activate this feature, set `diacritics_support = true` in your translation. There are a few additional "settings" - translation lines where the "translation" is some control instruction instead.
+All of these are optional, though your language may show glitches unless you define some. For example, Bangla _needs_ a definition for U+09CD, where the Unicode category does not fully define the required behaviour.
+
+Each of the following definitions represents zero or more characters, and can simply list them as one string.
+For readability, they can also be quoted (" surrounding the entire definition), characters can be separated by spaces, or you can use standard "U+xxxx" representations (these need space separators).
+These entries, unlike the rest of a translation file, also support entry-specific comments: After the code(s), from a '#' to the end of the line.
+To find out which codes you might need to specify, look up the Unicode support for your language's script, e.g. on https://www.unicode.org/charts/.
+If your language does not need these, feel free to ignore, or use "" to avoid the "requires translation" mark.
+
+### Limitations
+- Consider this feature as being in an experimental stage.
+- Can only work if the language's script still consists of individual glyphs rendered left to right.
+- The underlying libraries (Java AWT or Android) must be able to render the combinations you need - sometimes you will need to select and possibly install a specific font to see the intended results.
+- Using diacritics support incurs a performance penalty, mostly while languages are loaded when Unciv starts.
+- The feature also has a "quantity" limitation from the size of the Unicode "Private Use Area" (only the one in the BMP can be used), and this must be shared by the feature whereby Unciv automatically displays ruleset object icons:
+  The total number of distinct diacritic "combinations" (or glyphs) your translation actually uses plus the number of objects in the loaded mods (or vanilla ruleset) must not exceed 6400.
+- When enabled and the range is the default (or spans the unicode range for surrogates), then the engine will treat Unicode surrogate pairs correctly, assigning a fake alphabet codepoint for them and allowing diacritics to include them in a combo.
+  However, the parser is strict and throws an Exception on violations of the UTF-16 standard. If your translation crashes Unciv, check your editor for incorrect Unicode handling
+  (translation files are UTF-8 not UTF-16, but unfortunately most encoding converters allow transferring mismatched surrogate pairs).
+  Also, this possibility could so far not be successfully tested for emoji - no supporting font found, see "experimental".
+
+### Settings (as translation entries in the language file)
+- `diacritics_support`: This entry must be set to "true" for the diacritics support to work at all. Any other value will cause text to be passed through unchanged.
+- `unicode_block_start_character` and `unicode_block_end_character`: These define the range of characters that should be considered. One character or code each. If either is missing, the range defaults to almost the entire BMP (U+0021 to U+FFEE).
+  All characters in this range will be categorized. Characters undefined by Unicode, controls and punctuation, as well as any character outside the range, will pass through unchanged and reset the diacritics engine at that point - that is, pending potential combinations will be flushed.
+  Limiting this range - e.g. to the Unicode page dedicated to your language - is a performance optimization, but ultimately not required.
+- `left_joining_diacritics`: Optionally define additional codes meant to join with the character to the left of them. By default, the unicode categories "Mn", "Mc" and "Me" within the range described above are used.
+- `right_joining_diacritics`: Optionally define additional codes meant to join with the character to the right of them, by default none.
+- `left_and_right_joiners`: Optionally define additional codes meant to join with the character to the left AND with the character to the right, by default none
+
+These are processed in listed order and can override previous categorizations per character codepoint.
+Thus a code specified in `left_and_right_joiners` can be in the "Mn" unicode category, which would put it into the `left_joining_diacritics`, but will still work, because the later definition overrides the earlier one.
+
 ## Why not use a crowdsourcing translation website like <...>?

 1. Testing. Currently, translations undergo a number of tests for verification. This allows some language changes to be accepted and others not, and it's all in the same platform with the same tests. External translation tools don't allow for this.
--- a/tests/src/com/unciv/dev/FasterUIDevelopment.kt
+++ b/tests/src/com/unciv/dev/FasterUIDevelopment.kt
@ -180,8 +180,8 @@ class FontDesktop : FontImplementation {

    override fun getFontSize() = Fonts.ORIGINAL_FONT_SIZE.toInt()

-    override fun getCharPixmap(char: Char): Pixmap {
-        var width = metric.charWidth(char)
+    override fun getCharPixmap(symbolString: String): Pixmap {
+        var width = metric.stringWidth(symbolString)
        var height = metric.height
        if (width == 0) {
            height = Fonts.ORIGINAL_FONT_SIZE.toInt()
@ -193,7 +193,7 @@ class FontDesktop : FontImplementation {
        g.setRenderingHint(RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON)
        g.font = font
        g.color = java.awt.Color.WHITE
-        g.drawString(char.toString(), 0, metric.leading + metric.ascent)
+        g.drawString(symbolString, 0, metric.leading + metric.ascent)

        val pixmap = Pixmap(bi.width, bi.height, Pixmap.Format.RGBA8888)
        val data = bi.getRGB(0, 0, bi.width, bi.height, null, 0, bi.width)
--- a/tests/src/com/unciv/logic/TranslationTests.kt
+++ b/tests/src/com/unciv/logic/TranslationTests.kt
@ -16,13 +16,14 @@ import com.unciv.models.translations.getPlaceholderText
 import com.unciv.models.translations.squareBraceRegex
 import com.unciv.models.translations.tr
 import com.unciv.testing.GdxTestRunner
+import com.unciv.testing.RedirectOutput
+import com.unciv.testing.RedirectPolicy
+import com.unciv.ui.components.fonts.DiacriticSupport
 import com.unciv.utils.Log
 import org.junit.Assert
 import org.junit.Before
 import org.junit.Test
 import org.junit.runner.RunWith
-import java.io.OutputStream
-import java.io.PrintStream

@RunWith(GdxTestRunner::class)
 class TranslationTests {
@ -30,27 +31,23 @@ class TranslationTests {
    private var ruleset = Ruleset()

    @Before
+    // Since the ruleset and translation loader have their own output,
+    // we 'disable' the output stream for their outputs, and only enable it for the test itself.
+    @RedirectOutput(RedirectPolicy.Discard)
    fun loadTranslations() {
-        // Since the ruleset and translation loader have their own output,
-        // We 'disable' the output stream for their outputs, and only enable it for the test itself.
-        val outputChannel = System.out
-        System.setOut(PrintStream(object : OutputStream() {
-            override fun write(b: Int) {}
-        }))
        translations.readAllLanguagesTranslation()
        RulesetCache.loadRulesets(noMods = true)
        ruleset = RulesetCache.getVanillaRuleset()
-        System.setOut(outputChannel)
    }

    @Test
    fun translationsLoad() {
-        Assert.assertTrue("This test will only pass there are translations",
+        Assert.assertTrue("This test will only pass if there are translations",
                translations.size > 0)
    }


-    // This test is incorrectly defined: it should read from the template.properties file and not fro the final translation files.
+    // This test is incorrectly defined: it should read from the template.properties file and not from the final translation files.
 //    @Test
 //    fun allUnitActionsHaveTranslation() {
 //        val actions: MutableSet<String> = HashSet()
@ -325,6 +322,83 @@ class TranslationTests {
        Assert.assertTrue(Stats.isStats(Stats(1f,2f,3f).toStringForNotifications()))
    }

+    /** A [DiacriticSupport] built from nothing but the loaded English texts must report itself as not enabled. */
+    @Test
+    fun diacriticsSilentForEnglish() {
+        DiacriticSupport.reset()
+        // Pair each entry's key with its English text, leaving out entries that have none
+        val englishPairs = translations.values.mapNotNull { translationEntry ->
+            val englishText = translationEntry["English"] ?: return@mapNotNull null
+            translationEntry.entry to englishText
+        }
+        val englishByKey = englishPairs.toMap(HashMap())
+        val englishSupport = DiacriticSupport(englishByKey)
+        Assert.assertFalse(englishSupport.isEnabled())
+    }
+
+    @Test
+    fun diacriticsWorkForBangla() {
+        //todo This test was designed before the Bangla language was merged, and uses its own data.
+        //     With the actual Bangla, the @Before will already have loaded that, so there could be an additional, or a different test on the live one...
+
+        // Here's a helper to _generate_ the expected set from the sample list below:
+        // https://play.kotlinlang.org/#eyJ2ZXJzaW9uIjoiMi4wLjAiLCJwbGF0Zm9ybSI6ImphdmEiLCJhcmdzIjoiIiwibm9uZU1hcmtlcnMiOnRydWUsInRoZW1lIjoiaWRlYSIsImNvZGUiOiJwYWNrYWdlIHRvdWhpZHVycnJcblxuY29uc3QgdmFsIEJBTkdMQV9DSEFSU0VUX1NUQVJUID0gMHgwOTgwXG5jb25zdCB2YWwgQkFOR0xBX0NIQVJTRVRfRU5EID0gMHgwOWZmXG5cbnZhbCBCQU5HTEFfRElBQ1JJVElDUyA9IGxpc3RPZihcbiAgICAn4KaBJywgJ+CmgicsICfgpoMnLCAn4Ka8JyxcbiAgICAn4Ka+JywgJ+CmvycsICfgp4AnLCAn4KeBJyxcbiAgICAn4KeCJywgJ+CngycsICfgp4QnLCAn4KeHJyxcbiAgICAn4KeIJywgJ+CniycsICfgp4wnLCAn4KeNJyxcbiAgICAn4KeXJywgJ+CnoicsICfgp6MnLCAn4Ke+JyxcbilcblxuY29uc3QgdmFsIEJBTkdMQV9KT0lORVIgPSAn4KeNJ1xuXG5mdW4gaXNCYW5nbGFDaGFyKGNoOiBDaGFyKTogQm9vbGVhbiB7XG4gICAgcmV0dXJuIGNoLmNvZGUgPj0gQkFOR0xBX0NIQVJTRVRfU1RBUlQgJiYgY2guY29kZSA8PSBCQU5HTEFfQ0hBUlNFVF9FTkRcbn1cblxudmFsIGFsbFNlcXVlbmNlcyA9IG11dGFibGVTZXRPZjxTdHJpbmc+KClcblxuZnVuIG1haW4oKSB7XG4gICAgdmFsIGxpbmVzID0gbGlzdE9mKFxuICAgICAgICBcIuCmruCmvuCmqOCmmuCmv+CmpOCnjeCmsCDgprjgpq7gp43gpqrgpr7gpqbgppVcIiwgXCLgpqbgp4fgppbgp4HgpqhcIiwgXCLgpongp47gpqrgpqjgp43gpqgg4KaV4Kaw4KeB4KaoXCIsIFwi4KaG4KaC4Ka24Ka/4KaVXCIsIFwi4KaW4KeN4Kaw4Ka/4Ka34KeN4Kaf4Kaq4KeC4Kaw4KeN4KasXCIsIFwi4Ka44KaC4KaV4KeN4Ka34Ka/4Kaq4KeN4KakXCIsIFwi4Ka24KaV4KeN4Kak4Ka/XCIsIFwi4Ka34KeN4Kag4KeN4Kav4KeHXCJcbiAgICApXG5cbiAgICBsaW5lcy5mb3JFYWNoIHsgbGluZSAtPlxuICAgICAgICB2YWwgbGFzdFNlcXVlbmNlID0gU3RyaW5nQnVpbGRlcigpXG4gICAgICAgIGxpbmUuZm9yRWFjaCB7IGNoIC0+XG4gICAgICAgICAgICBpZiAoIWlzQmFuZ2xhQ2hhcihjaCkpIHtcbiAgICAgICAgICAgICAgICBpZiAobGFzdFNlcXVlbmNlLmlzTm90RW1wdHkoKSkge1xuICAgICAgICAgICAgICAgICAgICBhbGxTZXF1ZW5jZXMuYWRkKGxhc3RTZXF1ZW5jZS50b1N0cmluZygpKVxuICAgICAgICAgICAgICAgICAgICBsYXN0U2VxdWVuY2UuY2xlYXIoKVxuICAgICAgICAgICAgICAgIH1cbiAgICAgICAgICAgICAgICByZXR1cm5AZm9yRWFjaFxuICAgICAgICAgICAgfVxuXG4gICAgICAgICAgICBpZiAoQkFOR0xBX0RJQUNSSVRJQ1MuY29udGFpbnMoY2gpKSB7XG4gICAgICAgICAgICAgICAgbGFzdFNlcXVlbmNlLmFwcGVuZChjaClcbiAgICAgICAgICAgICAgICByZXR1cm5AZm9yRWFjaFxuICAgICAgICAgICAgfVxuXG4gICAgICAgICAgICBpZiAobGFzdFNlcXVlbmNlLmlzTm90RW1wdHkoKSAmJiBsYXN0U2VxdWVuY2UubGFzdCgpICE9IEJBTkdMQV9KT0lORVIpIHtcbiAgICAgICAgICAgICAgICBhbGxTZXF1ZW5jZXMuYWRkKGxhc3RTZXF1ZW5jZS50b1N0cmluZygpKVxuICAgICAgICAgICAgICAgIGxhc3RTZXF1ZW5jZS5jbGVhcigpXG4gICAgICAgICAgICB9XG4gICAgICAgICAgICBsYXN0U2VxdWVuY2UuYXBwZW5kKGNoKVxuICAgICAgICB9XG5cbiAgICAgICAgaWYgKGxhc3RTZXF1ZW5jZS5pc05vdEVtcHR5KCkpIHtcbiAgICAgICAgICAgIGFsbFNlcXVlbmNlcy5hZGQobGFzdFNlcXVlbmNlLnRvU3RyaW5nKCkpXG4gICAgICAgIH1cbiAgICB9XG5cbiAgICBwcmludGxuKGFsbFNlcXVlbmNlcy5maWx0ZXIgeyBpdC5sZW5ndGggPiAxIH0uam9pblRvU3RyaW5nKFwiXFxcIiwgXFxcIlwiLCBcInZhbCBleHBlY3RlZCA9IHNldE9mKFxcXCJcIiwgXCJcXFwiKVwiKSB7IGl0LmFzU2VxdWVuY2UoKS5qb2luVG9TdHJpbmcoKSB9KVxufSJ9
+
+        DiacriticSupport.reset()
+        val leftJoiners = "ঁ ং ঃ ় া ি ী ু ূ ৃ ৄ ে ৈ ো ৌ ্ ৗ ৢ ৣ ৾".replace(" ", "")
+        val bothSideJoiners = "্"
+        val banglaRange = Char(0x0980U)..Char(0x09FDU)
+        val banglaSupport = DiacriticSupport(true, banglaRange, leftJoiners, "", bothSideJoiners)
+
+        // Run the sample texts through the remapper first, then ask which combinations it knows
+        val samples = listOf(
+            "মানচিত্র সম্পাদক", "দেখুন", "উৎপন্ন করুন", "আংশিক", "খ্রিষ্টপূর্ব", "সংক্ষিপ্ত", "শক্তি", "ষ্ঠ্যে"
+        )
+        for (sample in samples) banglaSupport.remapDiacritics(sample)
+        val actual = banglaSupport.getKnownCombinations()
+
+        // Spelled out as comma-separated codepoints for legibility; dropping the separators yields the real sequences
+        val expected = setOf(
+                "ম, া", "চ, ি", "ত, ্, র", "ম, ্, প, া", "দ, ে", "খ, ু", "ন, ্, ন", "র, ু", "আ, ং", "শ, ি",
+                "খ, ্, র, ি", "ষ, ্, ট", "প, ূ", "র, ্, ব", "স, ং", "ক, ্, ষ, ি", "প, ্, ত", "ক, ্, ত, ি",
+                "ষ, ্, ঠ, ্, য, ে"
+            ).mapTo(LinkedHashSet()) { it.replace(", ", "") }
+        Assert.assertEquals(expected, actual)
+    }
+
+    /**
+     *  Sends the Bangla translation of "Overview" through the fake alphabet and back.
+     *
+     *  Needs an actual translation for "Overview" in Bangla.properties - without one the test passes silently.
+     *  The expected text is the translation as it was at the time of writing, so the test will fail
+     *  once a translator changes it.
+     */
+    @Test
+    // @RedirectOutput(RedirectPolicy.Show) // has good visualization - comment in and run to see how the fake alphabet actually works
+    fun diacriticsTestBanglaRoundtrip() =
+        testRoundtrip(language = "Bangla", term = "Overview", input = "সংক্ষিপ্ত বিবরণী")
+
+    /** An emoji written as a surrogate pair must come out of the translation as one single fake alphabet char. */
+    @Test
+    fun testNonBasePlaneUnicode() {
+        val textWithEmoji = "Test\uD83D\uDC4D"
+        translations.createTranslations("Test", hashMapOf("Test" to textWithEmoji, "diacritics_support" to "true"))
+        testRoundtrip(language = "Test", term = "Test", input = textWithEmoji, additionalTest = { mapped ->
+            // "Test" takes 4 chars, so a total length of 5 leaves exactly one char for the whole emoji
+            val hasSingleFakeChar = mapped.startsWith("Test") &&
+                mapped.length == 5 &&
+                mapped.last() > DiacriticSupport.getCurrentFreeCode()
+            Assert.assertTrue("Translation with one emoji should have exactly one fake alphabet codepoint", hasSingleFakeChar)
+        })
+    }
+
+    /**
+     *  Translates [term] with [language] selected and expects that replacing every char of the result
+     *  via [DiacriticSupport.getStringFor] gives back [input].
+     *
+     *  Installs a fresh [UncivGame] with new [GameSettings] as [UncivGame.Current] and copies the loaded translations into it.
+     *  Returns without asserting anything when `tr()` hands [term] back unchanged.
+     *
+     *  @param additionalTest optional extra check, called with the translated string after the roundtrip assertion
+     */
+    private fun testRoundtrip(language: String, term: String, input: String, additionalTest: ((String)->Unit)? = null) {
+        UncivGame.Current = UncivGame()
+        UncivGame.Current.settings = GameSettings()
+        UncivGame.Current.settings.language = language
+        translations.forEach { (key, value) -> UncivGame.Current.translations[key] = value }
+
+        val translated = term.tr()
+        // An unchanged term means there is no translation to work with
+        if (translated == term) return
+
+        val roundtripped = translated.map { DiacriticSupport.getStringFor(it) }.joinToString("")
+
+        // Local formatting helpers for the diagnostic output below
+        fun Char.hex() = "U+${code.toString(16).padStart(4, '0')}"
+        fun String.hex() = map { it.hex() }.joinToString(" ")
+        fun String.literalAndHex() = "\"$this\" = ${hex()}"
+        val perCharMapping = translated
+            .map { "${it.hex()} -> ${DiacriticSupport.getStringFor(it).literalAndHex()}" }
+            .joinToString("; ")
+        println("Mapping '$term' in $language to fake alphabet and back:\n\tinput: ${input.literalAndHex()}\n\ttranslated: $perCharMapping\n\toutput: ${roundtripped.literalAndHex()}")
+
+        Assert.assertEquals(input, roundtripped)
+        if (additionalTest != null) additionalTest(translated)
+    }

 //    @Test
 //    fun allConditionalsAreContainedInConditionalOrderTranslation() {