Jump to content
Main menu
Main menu
move to sidebar
hide
Navigation
Main page
Recent changes
Random page
Help about MediaWiki
Humanipedia
Search
Search
Appearance
Create account
Log in
Personal tools
Create account
Log in
Pages for logged out editors
learn more
Contributions
Talk
Editing
Module:Unicode data/testcases
Module
Discussion
English
Read
Edit source
View history
Tools
Tools
move to sidebar
hide
Actions
Read
Edit source
View history
General
What links here
Related changes
Special pages
Page information
Appearance
move to sidebar
hide
Warning:
You are not logged in. Your IP address will be publicly visible if you make any edits. If you
log in
or
create an account
, your edits will be attributed to your username, along with other benefits.
Anti-spam check. Do
not
fill this in!
-- Test cases for [[Module:Unicode data]], built on [[Module:UnitTests]].
--
-- Each `p:test_*` method feeds a list of examples to `self:iterate`, whose
-- callback compares the module's actual result with the expected value via
-- `self:equals(label, actual, expected)`.
local p = require "Module:UnitTests"

-- When the current title's subpage text is "sandbox", test the sandbox copy
-- of the module instead of the live one.
local Unicode_data = require(mw.title.getCurrentTitle().subpageText == "sandbox"
	and "Module:Unicode data/sandbox"
	or "Module:Unicode data")

local U = mw.ustring.char
local get_codepoint = mw.ustring.codepoint

-- Builds the label used for a code point in the test output.
-- Returns "U+XXXX" alone when Unicode_data.is_printable(codepoint) is false,
-- otherwise "U+XXXX: " followed by a rendering of the character.
local function show(codepoint)
	if Unicode_data.is_printable(codepoint) then
		local printed_codepoint = U(codepoint)
		-- If NFC normalization would change the character, emit a numeric
		-- character reference instead of the raw character.
		if mw.ustring.toNFC(printed_codepoint) ~= printed_codepoint then
			printed_codepoint = ("&#x%X;"):format(codepoint)
		end
		-- Give combining characters a dotted-circle base.
		if Unicode_data.is_combining(codepoint) then
			printed_codepoint = "◌" .. printed_codepoint
		end
		return ("U+%04X: %s"):format(codepoint, printed_codepoint)
	else
		return ("U+%04X"):format(codepoint)
	end
end

-- Same as show(), with the result of Unicode_data.lookup_name appended in
-- parentheses.
local function show_codepoint_and_name(codepoint)
	return ("%s (%s)"):format(show(codepoint), Unicode_data.lookup_name(codepoint))
end

-- Examples are { codepoint, expected name }.
function p:test_lookup_name()
	local examples = {
		{ 0x0000, "<control-0000>" },
		{ 0x007F, "<control-007F>" },
		{ 0x00C1, "LATIN CAPITAL LETTER A WITH ACUTE" },
		{ 0x0300, "COMBINING GRAVE ACCENT" },
		{ 0x0378, "<reserved-0378>" },
		{ 0x1B44, "BALINESE ADEG ADEG" },
		{ 0x1F71, "GREEK SMALL LETTER ALPHA WITH OXIA" },
		{ 0x3555, "CJK UNIFIED IDEOGRAPH-3555" },
		{ 0xAC01, "HANGUL SYLLABLE GAG" },
		{ 0xD5FF, "HANGUL SYLLABLE HEH" },
		{ 0xDC00, "<surrogate-DC00>", },
		{ 0xEEEE, "<private-use-EEEE>" },
		{ 0xFDD1, "<noncharacter-FDD1>", },
		{ 0xFFFD, "REPLACEMENT CHARACTER" },
		{ 0xFFFF, "<noncharacter-FFFF>" },
		{ 0x1F4A9, "PILE OF POO" },
		{ 0xE0000, "<reserved-E0000>" },
		{ 0xF0F0F, "<private-use-F0F0F>" },
		{ 0x10FFFF, "<noncharacter-10FFFF>" },
	}

	self:iterate(examples, function (self, codepoint, name)
		self:equals(show(codepoint), Unicode_data.lookup_name(codepoint), name)
	end)
end

-- Examples are { codepoint, expected age string }.
function p:test_lookup_age()
	local examples = {
		{ 0x0061, "1.1" },
		{ 0x0378, "NA" },
		{ 0x1B44, "5.0" },
		{ 0x3555, "3.0" },
		{ 0xAC01, "2.0" },
		{ 0xDC00, "2.0", },
		{ 0xEEEE, "1.1" },
		{ 0xFDD1, "3.1", },
		{ 0x1F4A9, "6.0" },
		{ 0xE0000, "NA" },
		{ 0xF0F0F, "2.0" },
		{ 0x10FFFF, "2.0" },
	}

	self:iterate(examples, function (self, codepoint, age)
		-- Remove pcall when this function is added to [[Module:Unicode data]].
		-- Until then, any error raised while checking a row is discarded.
		pcall(function ()
			self:equals(show(codepoint), Unicode_data.lookup_age(codepoint), age)
		end)
	end)
end

-- Examples are { codepoint, expected boolean }.
function p:test_is_combining()
	local examples = {
		{ 0x0300, true },
		{ 0x0060, false },
	}

	self:iterate(examples, function (self, codepoint, expected)
		self:equals(
			show_codepoint_and_name(codepoint),
			Unicode_data.is_combining(codepoint),
			expected)
	end)
end

-- Examples are { codepoint, expected boolean }.
function p:test_is_default_ignorable()
	local examples = {
		{ 0x0061, false },
		{ 0x00AD, true },
	}

	self:iterate(examples, function (self, codepoint, expected)
		-- Remove pcall when this function is added to [[Module:Unicode data]].
		-- Until then, any error raised while checking a row is discarded.
		pcall(function ()
			self:equals(
				show_codepoint_and_name(codepoint),
				Unicode_data.is_default_ignorable(codepoint),
				expected)
		end)
	end)
end

-- Examples are { codepoint, expected script code }.
function p:test_lookup_script()
	local examples = {
		{ 0x0061, "Latn" },
		{ 0x002F, "Zyyy" },
		{ 0x0300, "Zinh" },
		{ 0x0378, "Zzzz" },
		{ 0x0398, "Grek" },
		{ 0x03E2, "Copt" },
		{ 0x2014, "Zyyy" },
	}

	self:iterate(examples, function (self, codepoint, expected)
		self:equals(
			show_codepoint_and_name(codepoint),
			Unicode_data.lookup_script(codepoint),
			expected)
	end)
end

-- Examples are { codepoint, expected two-letter category }.
function p:test_lookup_category()
	local examples = {
		{ get_codepoint "\t", "Cc" },
		{ get_codepoint " ", "Zs" },
		{ get_codepoint "[", "Ps" },
		{ get_codepoint "]", "Pe" },
		{ get_codepoint "^", "Sk" },
		{ get_codepoint "A", "Lu" },
		{ 0x00AD, "Cf" },
		{ get_codepoint "¾", "No" },
		{ get_codepoint "«", "Pi" },
		{ get_codepoint "»", "Pf" },
		{ 0x0300, "Mn" },
		{ 0x0488, "Me" },
		{ get_codepoint "٣", "Nd" },
		{ get_codepoint "子", "Lo" },
		{ get_codepoint "ᾮ", "Lt" },
		{ 0x1B44, "Mc" },
		{ get_codepoint "∈", "Sm" },
		{ get_codepoint "‿", "Pc" },
		{ get_codepoint "↹", "So" },
		{ get_codepoint "⸗", "Pd" },
		{ get_codepoint "Ⅷ", "Nl" },
		{ 0x2028, "Zl" },
		{ 0x2029, "Zp" },
		{ get_codepoint "ゞ", "Lm" },
		{ 0xD800, "Cs" },
		{ get_codepoint "£", "Sc" },
		{ 0xFFFF, "Cn" },
		{ 0x100000, "Co" },
	}

	self:iterate(examples, function (self, codepoint, expected)
		self:equals(
			show_codepoint_and_name(codepoint),
			Unicode_data.lookup_category(codepoint),
			expected)
	end)
end

local fun = require "Module:Fun"
local m_table = require "Module:TableTools"

-- Metatable for per-string script counters. Reading a missing key stores and
-- returns 0. The table is also its own metatable, so calling it
-- (`script_to_count_mt()`) creates a fresh counter table.
local script_to_count_mt = {
	__index = function (self, key)
		self[key] = 0
		return 0
	end,
	__call = function (self, ...)
		return setmetatable({}, self)
	end
}
setmetatable(script_to_count_mt, script_to_count_mt)

-- Memoizing lookup: script_counts[str] is a string of the form
-- "Script (count), Script (count), ..." covering every code point in `str`,
-- ordered by descending count. Non-string keys yield nil.
local script_counts = setmetatable({}, {
	__index = function (self, str)
		if type(str) ~= "string" then
			return nil
		end

		local script_to_count = script_to_count_mt()
		for codepoint in mw.ustring.gcodepoint(str) do
			local script = Unicode_data.lookup_script(codepoint)
			script_to_count[script] = script_to_count[script] + 1
		end

		local printed = table.concat(
			fun.mapIter(
				function (count, script)
					return ("%s (%d)"):format(script, count)
				end,
				m_table.sortedPairs(
					script_to_count,
					function (script1, script2)
						return script_to_count[script1] > script_to_count[script2]
					end)),
			", ")

		-- Cache the result so later lookups bypass __index.
		self[str] = printed
		return printed
	end,
})

-- Shared by test_get_best_script and test_is_Latin.
-- Table entries are { string, expected best script or nil }; bare strings are
-- passed to self:iterate as-is (NOTE(review): presumably rendered as section
-- headings by [[Module:UnitTests]] - confirm).
--
-- NOTE(review): show_script_example replaces "\n" with "<br>", which suggests
-- the quotations below were originally multi-line; as they stand here only
-- one line break remains. Confirm against the canonical module text.
local script_examples = {
	-- To demonstrate that "is_Latin" doesn't treat a string of Zyyy and Zinh
	-- characters as Latn.
	-- This particular example only has characters below U+0340, so
	-- lookup_script doesn't have to be called.
	{ "%!?́", nil },
	{ "’ʼ“”†‡•‰′‽⁕", nil },
	{ "col·legi", "Latn" },

	"HTML character references",
	-- NOTE(review): the two entries in each pair below are identical literal
	-- characters; given the heading they were presumably written as two
	-- different character references - confirm against the canonical text.
	{ "𐘀", "Lina" },
	{ "𐘀", "Lina" },
	{ "–", nil },
	{ "–", nil },

	-- Examples from [[Template talk:Lang#Italicisation of Halkomelem]]
	"Halkomelem",
	{ "lá:yelhp", "Latn" },
	{ "xʷməθkʷəy̓əm", nil }, -- one Greek (Grek) character
	{ "hən̓q̓əmin̓əm̓", "Latn" },

	"Quotes",
	-- [[s:it:Divina Commedia/Inferno/Canto I]]
	{
		[[Tant’è amara che poco è più morte; ma per trattar del ben ch’i’ vi trovai, dirò de l’altre cose ch’i’ v’ ho scorte.]],
		"Latn"
	},
	{
		-- A blessing in Navajo:
		-- [[User talk:Stephen G. Brown/text8]]
		[[Díí Késhmish biyiʼ yáʼąąshdę́ę́ʼ ląʼígóó bee nikʼihojidlíi dooleeł. 
Niheechʼínáánáháhígíí biyiʼ iłhodeezyéél, iłhózhǫ́, ayóóʼóʼóʼní bee nikʼihojidlíi dooleeł. Tʼáá sahdiigiʼ átʼéego baa hózhǫ́ǫgo nihił hanááhoolzhiizhígí biyiʼ tʼáá ałtsojįʼ iłhózhǫ́ nííʼ dooleeł.]],
		"Latn"
	},
	{
		-- The opening of the Iliad ([[s:el:Ιλιάς/Α]]), with macrons and
		-- breves added to mark the length of the monophthongs α, ι, υ:
		[[Μῆνῐν ᾰ̓́ειδε, θεᾱ́, Πηληῐ̈ᾰ́δεω Ᾰ̓χῐλῆος οὐλομένην, ἣ μῡρῐ́᾽ Ᾰ̓χαιοῖς ᾰ̓́λγε᾽ ἔθηκε, πολλᾱ̀ς δ᾽ ῐ̓φθῑ́μους ψῡχᾱ̀ς Ἄῐ̈δῐ προῐ̈́ᾰψεν ἡρώων, αὐτοὺς δὲ ἑλώρῐᾰ τεῦχε κῠ́νεσσιν οἰωνοῖσῐ́ τε πᾶσῐ· Δῐὸς δ᾽ ἐτελείετο βουλή·]],
		"Grek"
	},
	{
		-- The Brothers Karamazov: [[w:ru:Братья Карамазовы (Достоевский)/Книга первая]]
		[[Вот если вы не согласитесь с этим последним тезисом и ответите: «Не так» или «не всегда так», то я, пожалуй, и ободрюсь духом насчет значения героя моего Алексея Федоровича. Ибо не только чудак «не всегда» частность и обособление, а напротив, бывает так, что он-то, пожалуй, и носит в себе иной раз сердцевину целого, а остальные люди его эпохи — все, каким-нибудь наплывным ветром, на время почему-то от него оторвались…]],
		"Cyrl"
	},
	{
		-- Rig Veda: [[https://sa.wikisource.org/wiki/ऋग्वेदः_सूक्तं_१.१]]
		[[ॐ अग्निमीळे पुरोहितं यज्ञस्य देवमृत्विजम् । होतारं रत्नधातमम् ॥१॥ अग्निः पूर्वेभिरृषिभिरीड्यो नूतनैरुत । स देवाँ एह वक्षति ॥२॥ अग्निना रयिमश्नवत् पोषमेव दिवेदिवे । यशसं वीरवत्तमम् ॥३॥ अग्ने यं यज्ञमध्वरं विश्वतः परिभूरसि । स इद्देवेषु गच्छति ॥४॥ अग्निर्होता कविक्रतुः सत्यश्चित्रश्रवस्तमः । देवो देवेभिरा गमत् ॥५॥ यदङ्ग दाशुषे त्वमग्ने भद्रं करिष्यसि । तवेत् तत् सत्यमङ्गिरः ॥६॥ उप त्वाग्ने दिवेदिवे दोषावस्तर्धिया वयम् । नमो भरन्त एमसि ॥७॥ राजन्तमध्वराणां गोपामृतस्य दीदिविम् । वर्धमानं स्वे दमे ॥८॥ स नः पितेव सूनवेऽग्ने सूपायनो भव । सचस्वा नः स्वस्तये ॥९॥]],
		"Deva"
	},
}

-- Memoizing lookup: ends_in_punctuation[str] is true when the last character
-- of `str` matches the "%p" pattern class, false otherwise.
local ends_in_punctuation = setmetatable({}, {
	__index = function (self, key)
		local val = mw.ustring.match(mw.ustring.sub(key, -1), "%p") ~= nil
		self[key] = val
		return val
	end,
})

-- Builds the label for a script example: the example text with newlines
-- turned into "<br>", a separator, then its per-script counts.
local function show_script_example(script_example)
	local separator = ": "
	-- If last character is punctuation, place script counts on their own line
	-- Could use Unicode_data.lookup_category, but that is more memory-intensive.
	if ends_in_punctuation[script_example] then
		separator = "<br>• "
	end

	-- gsub's second result (the replacement count) is dropped by the
	-- concatenation, so a single string is returned.
	return script_example:gsub('\n', '<br>') .. separator .. script_counts[script_example]
end

function p:test_get_best_script()
	self:iterate(script_examples, function (self, str, expected)
		self:equals(
			show_script_example(str),
			Unicode_data.get_best_script(str),
			expected)
	end)
end

-- Expects is_Latin(str) to be true when the example's best script is "Latn",
-- unless the example supplies an explicit third field.
function p:test_is_Latin()
	self:iterate(script_examples, function (self, str, best_script, is_Latin)
		self:equals(
			show_script_example(str),
			Unicode_data.is_Latin(str),
			is_Latin or best_script == "Latn")
	end)
end

-- Examples are { codepoint, expected block name }.
function p:test_lookup_block()
	local examples = {
		{ 0x0064, "Basic Latin" },
		{ 0x030B, "Combining Diacritical Marks" },
		{ 0x03A3, "Greek and Coptic" },
		{ 0x0411, "Cyrillic" },
		{ 0x10E6, "Georgian" },
		{ 0x3175, "Hangul Compatibility Jamo" },
		{ 0xAC01, "Hangul Syllables" },
		{ 0x4E0A, "CJK Unified Ideographs" },
		{ 0x1F608, "Emoticons" },
		{ 0x30000, "CJK Unified Ideographs Extension G"},
		{ 0x10FFFF, "Supplementary Private Use Area-B" },
	}

	self:iterate(examples, function (self, codepoint, block_name)
		self:equals(
			show(codepoint),
			Unicode_data.lookup_block(codepoint),
			block_name)
	end)
end

-- Examples are { string, expected boolean }.
function p:test_is_rtl()
	local examples = {
		{ "أبو عبد الله محمد بن عبد الله اللواتي الطنجي بن بطوطة", true }, -- Ibn Battuta's full name
		{ "أدب القاضي Adab al-qādī", false }, -- Example of incorrect input
		{ "ܛܘܼܒܲܝܗܘܿܢ ܠܐܲܝܠܹܝܢ ܕܲܕ݂ܟܹܝܢ ܒܠܸܒ̇ܗܘܿܢ܄ ܕܗܸܢ݂ܘܿܢ ܢܸܚܙܘܿܢ ܠܐܲܠܵܗܵܐ܂", true }, -- Syriac, sixth beatitude (Matthew 5:8)
		{ "בְּרֵאשִׁית, בָּרָא אֱלֹהִים, אֵת הַשָּׁמַיִם, וְאֵת הָאָרֶץ.", true }, -- Hebrew: Genesis 1:1
		{ "𞤀𞤣𞤤𞤢𞤥 𞤆𞤵𞤤𞤢𞤪", true }, -- Adlam: name of alphabet
		{
			-- Avestan: Hymn to Haoma: Yasna 10.8 ([[wikt:𐬀𐬉𐬴𐬨𐬀]])
			"𐬬𐬍𐬯𐬞𐬈 ⸱ 𐬰𐬍 ⸱ 𐬀𐬥𐬌𐬌𐬉 ⸱ 𐬨𐬀𐬜𐬃𐬢𐬵𐬋 ⸱ 𐬀𐬉𐬴𐬨𐬀 ⸱ 𐬵𐬀𐬗𐬌𐬧𐬙𐬈 ⸱ 𐬑𐬭𐬎𐬎𐬍𐬨 ⸱ 𐬛𐬭𐬎𐬎𐬋 ⸱ 𐬁𐬀𐬝 ⸱ 𐬵𐬋 ⸱ 𐬫𐬋 ⸱ 𐬵𐬀𐬊𐬨𐬀𐬵𐬈 ⸱ 𐬨𐬀𐬜𐬋 ⸱ 𐬀𐬴𐬀 ⸱ 𐬵𐬀𐬗𐬀𐬌𐬙𐬈",
			true
		},
		{ "ދިވެހި", true }, -- the word dhivehi written in Thaana script
		{ "𐤀𐤓𐤍𐤟𐤆𐤐𐤏𐤋𐤟𐤀𐤕𐤁𐤏𐤋𐤟𐤁𐤍𐤀𐤇𐤓𐤌𐤟𐤌𐤋𐤊𐤂𐤁𐤋𐤟𐤋𐤀𐤇𐤓𐤌𐤟𐤀𐤁𐤄", true }, -- Phoenician: Ahiram sarcophagus ([[wikt:𐤀𐤓𐤍]])
		{ "ࡌࡀࡍࡃࡀ ࡖࡄࡉࡉࡀ", true }, -- Mandaic: manda ḏ'haije ("knowledge of life"; [[wikt:ࡌࡀࡍࡃࡀ ࡖࡄࡉࡉࡀ]])
		{ "ࠄࠟࠓࠂࠝࠓࠜࠉࠆࠜࠉࠌ", true }, -- Samaritan Hebrew: īargerēzēm ("Mount Gerizim"; [[wikt:Mount Gerizim]])
		{ "%$!^&", false },
	}

	self:iterate(examples, function (self, str, expected)
		self:equals(str, Unicode_data.is_rtl(str), expected)
	end)
end

-- Change function names into more readable headers for the testcases tables.
-- NOTE(review): this adds keys to `p` inside the loop, so it relies on
-- sortedPairs not traversing `p` live - confirm against [[Module:TableTools]].
for k, v in m_table.sortedPairs(p) do
	if type(k) == "string" then
		-- Only the first gsub result (the new string) is kept.
		local new_k = k:gsub("^test_(.+)$", "testcases for <code>%1</code>")
		if new_k ~= k then
			p[k] = nil
			p[new_k] = v
		end
	end
end

return p
Summary:
Please note that all contributions to Humanipedia may be edited, altered, or removed by other contributors. If you do not want your writing to be edited mercilessly, then do not submit it here.
You are also promising us that you wrote this yourself, or copied it from a public domain or similar free resource (see
Humanipedia:Copyrights
for details).
Do not submit copyrighted work without permission!
Cancel
Editing help
(opens in new window)
Template used on this page:
Module:Unicode data/testcases/doc
(
edit
)