Module:data consistency check

Definition from Wiktionary, the free dictionary
Jump to: navigation, search

This module performs a consistency check on the various data modules used on Wiktionary.

Checks performed[edit]

For multiple data modules:

  • Codes for languages, families and etymology-only languages must be unique and cannot clash with one another.

Codes in Module:languages data must:

  • Be defined in the correct submodule according to whether the code is two-letter, three-letter or exceptional.
  • Have canonicalName, which must not be the same as the canonical name of another language.
  • If scripts is given, then each script in the list must be a valid script code.
  • If family is given, it must be a valid family code.
  • If type is given, it must be one of the recognised values (regular, reconstructed, appendix-constructed).

Codes in Module:etymology languages data must:

  • Have canonicalName.
  • Have parent, which must be a valid language, family or etymology-only language code.

Codes in Module:families data must:

  • Have canonicalName, which must not be the same as the canonical name of another family.
  • If family is given, it must be a valid family code.
  • Have at least one language or subfamily belonging to it.

Codes in Module:scripts data must:

  • Have canonicalName.
  • Have at least one language that lists it as one of its scripts.
  • Have characters for script autodetection, which must be a Lua string pattern that matches any character in the script.

Output[edit]

Discrepancies detected:

Module:families/data

Module:scripts/data


-- Table of functions this module exposes; it is returned at the end of the file.
local export = {}

-- Messages recorded by discrepancy(), keyed by the data module name they were
-- filed under.
local messages = {}

-- Record one discrepancy against a data module.
--
-- modname: name of the data module the message is filed under (used as the
--          key into `messages`).
-- fmt:     string.format-style format string.
-- ...:     values substituted into fmt.
local function discrepancy(modname, fmt, ...)
	local unpack = unpack or table.unpack
	local nargs = select("#", ...)
	local args = { ... }

	-- Under Lua 5.1, string.format's %s rejects nil, booleans and tables, so a
	-- single malformed data field would abort the whole check instead of being
	-- reported. Stringify such values first; strings and numbers pass through
	-- untouched so numeric format specifiers keep working.
	for i = 1, nargs do
		local kind = type(args[i])
		if kind ~= "string" and kind ~= "number" then
			args[i] = tostring(args[i])
		end
	end

	if not messages[modname] then
		messages[modname] = {}
	end

	-- Pass the explicit count so trailing nil arguments are not dropped.
	table.insert(messages[modname], string.format(fmt, unpack(args, 1, nargs)))
end

-- Shared state filled in by the check_* functions below.

-- Maps every language, etymology-only language and family code seen so far to
-- the name of the data module that first defined it.
local all_codes = {}

-- Map a canonical name to the code that first claimed it, one table per kind
-- of entity (languages and etymology-only languages share language_names).
local language_names = {}
local family_names = {}
local script_names = {}

-- Sets (code -> true) of codes that something refers to: nonempty_fams holds
-- codes named as a language's family, an etymology-only language's parent or a
-- family's parent family; nonempty_scrs holds script codes listed by languages.
local nonempty_fams = {}
local nonempty_scrs = {}

-- Check every language data module: the two-letter module, the per-letter
-- three-letter modules and the module for exceptional codes. Problems are
-- reported through discrepancy(); all_codes, language_names, nonempty_fams and
-- nonempty_scrs are filled in as a side effect.
local function check_languages()
	local families = mw.loadData('Module:families/data')
	local scripts = mw.loadData('Module:scripts/data')

	-- The only values accepted for a language's `type` field.
	local valid_types = {
		["regular"] = true,
		["reconstructed"] = true,
		["appendix-constructed"] = true,
	}

	-- Build a wikilink to the language's category; "???" when the name is missing.
	local function link(name)
		if not name then
			return "???"
		end

		local title = name
		if not name:find("[Ll]anguage$") then
			title = name .. " language"
		end
		return "[[:Category:" .. title .. "|" .. title .. "]]"
	end

	-- Run the per-language checks that do not depend on which module the
	-- language was found in.
	local function check_language(modname, code, data)
		local name = data.canonicalName

		-- The code must not already be taken.
		local owner = all_codes[code]
		if owner then
			discrepancy(modname, "Code <code>%s</code> is not unique, is also defined in [[Module:%s]].", code, owner)
		else
			all_codes[code] = modname
		end

		-- A canonical name is required and must be unique among languages.
		if not name then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
		elseif language_names[name] then
			discrepancy(modname, "%s (<code>%s</code>) has a canonical name that is not unique, it is also used by the code <code>%s</code>.", link(name), code, language_names[name])
		else
			language_names[name] = code
		end

		-- The type, when given, must be one of the recognised values.
		local lang_type = data.type
		if lang_type and not valid_types[lang_type] then
			discrepancy(modname, "%s (<code>%s</code>) is of an invalid type <code>%s</code>.", link(name), code, lang_type)
		end

		-- The script list, when given, must be non-empty and hold known codes.
		local script_list = data.scripts
		if script_list then
			if script_list[1] then
				for _, sccode in ipairs(script_list) do
					if not scripts[sccode] then
						discrepancy(modname, "%s (<code>%s</code>) lists an invalid script code <code>%s</code>.", link(name), code, sccode)
					end

					-- Recorded even for unknown codes, as a "some language uses it" marker.
					nonempty_scrs[sccode] = true
				end
			else
				discrepancy(modname, "%s (<code>%s</code>) has no scripts listed.", link(name), code)
			end
		end

		-- The family, when given, must be a known family code.
		local family = data.family
		if family then
			if not families[family] then
				discrepancy(modname, "%s (<code>%s</code>) has an invalid family code <code>%s</code>.", link(name), code, family)
			end

			nonempty_fams[family] = true
		end
	end

	-- Two-letter codes.
	local two_letter_module = "languages/data2"
	for code, data in pairs(mw.loadData("Module:" .. two_letter_module)) do
		if not code:find("^[a-z][a-z]$") then
			discrepancy(two_letter_module, '%s (<code>%s</code>) does not have a two-letter code.', link(data.canonicalName), code)
		end

		check_language(two_letter_module, code, data)
	end

	-- Three-letter codes: one submodule per initial letter, visited a through z.
	for letter in ("abcdefghijklmnopqrstuvwxyz"):gmatch("%a") do
		local letter_module = "languages/data3/" .. letter
		local expected = "^" .. letter .. "[a-z][a-z]$"

		for code, data in pairs(mw.loadData("Module:" .. letter_module)) do
			if not code:find(expected) then
				discrepancy(letter_module, '%s (<code>%s</code>) does not have a three-letter code starting with "<code>%s</code>".', link(data.canonicalName), code, letter)
			end

			check_language(letter_module, code, data)
		end
	end

	-- Exceptional codes: anything that is NOT a plain two- or three-letter code.
	local exceptional_module = "languages/datax"
	for code, data in pairs(mw.loadData("Module:" .. exceptional_module)) do
		if code:find("^[a-z][a-z][a-z]?$") then
			discrepancy(exceptional_module, '%s (<code>%s</code>) has a two- or three-letter code.', link(data.canonicalName), code)
		end

		check_language(exceptional_module, code, data)
	end
end

-- Check the etymology-only language data: unique codes, a canonical name, a
-- valid parent, and no cycles in the chain of parents. Problems are reported
-- through discrepancy().
local function check_etym_languages()
	local modname = "etymology languages/data"
	-- This module is loaded with require() rather than mw.loadData.
	local etym_langs = require("Module:" .. modname)
	local languages = mw.loadData("Module:languages/alldata")
	local families = mw.loadData('Module:families/data')

	-- Human-readable name for messages (plain text, no wikilink);
	-- "???" when the name is missing.
	local function link(name)
		if not name then
			return "???"
		end
		if name:find("[Ll]anguage$") then
			return name
		end
		return name .. " language"
	end

	for code, data in pairs(etym_langs) do
		-- The code must not clash with any code registered earlier.
		local owner = all_codes[code]
		if owner then
			discrepancy(modname, "Code <code>%s</code> is not unique, is also defined in [[Module:%s]].", code, owner)
		else
			all_codes[code] = modname
		end

		-- A canonical name is required. A name that is already registered is
		-- left alone; no discrepancy is reported for it.
		local name = data.canonicalName
		if not name then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
		elseif not language_names[name] then
			language_names[name] = code
		end

		-- A parent is required and must be a language, family or
		-- etymology-only language code.
		local parent = data.parent
		if not parent then
			discrepancy(modname, "Etymology-only %s (<code>%s</code>) has no parent language or family code.", link(name), code)
		else
			if not (languages[parent] or families[parent] or etym_langs[parent]) then
				discrepancy(modname, "Etymology-only %s (<code>%s</code>) has invalid parent language or family code <code>%s</code>.", link(name), code, parent)
			end

			nonempty_fams[parent] = true
		end
	end

	-- Cycle detection: from every entry, follow the parents for as long as they
	-- are etymology-only languages. `verified` holds entries whose chain has
	-- already been walked; `on_path` holds the entries seen during this walk.
	local verified = {}
	for start_code, start_data in pairs(etym_langs) do
		local on_path = {}
		local cur_code, cur = start_code, start_data

		while cur and not verified[cur] do
			if on_path[cur] then
				-- We came back to an entry seen earlier in this same walk.
				discrepancy(modname, "%s (<code>%s</code>) has a cyclic parental relationship to %s (<code>%s</code>)",
					link(cur.canonicalName), cur_code,
					link(etym_langs[cur.parent].canonicalName), cur.parent
				)
				break
			end

			on_path[cur] = true

			-- Step to the parent; the walk ends when there is no parent or the
			-- parent is not an etymology-only language.
			local parent = cur.parent
			cur_code = parent
			cur = parent and etym_langs[parent]
		end

		for visited in pairs(on_path) do
			verified[visited] = true
		end
	end
end

-- Check the family data: unique codes and canonical names, valid parent
-- families, at least one member per family, and no cycles in the chain of
-- parent families. Reads nonempty_fams, so it has to run after the checks
-- that fill that table.
local function check_families()
	local modname = "families/data"
	local families = mw.loadData("Module:" .. modname)

	-- Build a wikilink to the family's category, displayed as "<name> family";
	-- "???" when the name is missing.
	local function link(name)
		if not name then
			return "???"
		end

		local category = name
		if not name:find("[Ll]anguages$") then
			category = name .. " languages"
		end
		return "[[:Category:" .. category .. "|" .. name .. " family]]"
	end

	-- Pass 1: per-family field checks.
	for code, data in pairs(families) do
		local owner = all_codes[code]
		if owner then
			discrepancy(modname, "Code <code>%s</code> is not unique, is also defined in [[Module:%s]].", code, owner)
		else
			all_codes[code] = modname
		end

		local name = data.canonicalName
		if not name then
			discrepancy(modname, "<code>%s</code> has no canonical name specified.", code)
		elseif family_names[name] then
			discrepancy(modname, "%s (<code>%s</code>) has a canonical name that is not unique, it is also used by the code <code>%s</code>.", link(name), code, family_names[name])
		else
			family_names[name] = code
		end

		local parent = data.family
		if parent then
			if not families[parent] then
				discrepancy(modname, "%s (<code>%s</code>) has an invalid parent family code <code>%s</code>.", link(name), code, parent)
			end

			nonempty_fams[parent] = true
		end
	end

	-- Pass 2: every family must have been referenced by something. This needs
	-- pass 1 to be complete, because subfamilies count as members too.
	for code, data in pairs(families) do
		if not nonempty_fams[code] then
			discrepancy(modname, "%s (<code>%s</code>) has no child families or languages.", link(data.canonicalName), code)
		end
	end

	-- Pass 3: cycle detection along the chain of parent families. `verified`
	-- holds codes whose chain has already been walked ('qfa-not' is treated as
	-- verified from the start); `on_path` holds the codes seen during this walk.
	local verified = { ['qfa-not'] = true }
	for start_code, start_data in pairs(families) do
		local on_path = {}
		local cur_code, cur = start_code, start_data

		while cur and not verified[cur_code] do
			if on_path[cur_code] then
				-- We came back to a code seen earlier in this same walk.
				discrepancy(modname, "%s (<code>%s</code>) has a cyclic parental relationship to %s (<code>%s</code>)",
					link(cur.canonicalName), cur_code,
					link(families[cur.family].canonicalName), cur.family
				)
				break
			end

			on_path[cur_code] = true
			cur_code, cur = cur.family, families[cur.family]
		end

		for visited_code in pairs(on_path) do
			verified[visited_code] = true
		end
	end
end

-- Check the script data: a canonical name, use by at least one language, and
-- a `characters` value that is accepted as a pattern. Reads nonempty_scrs, so
-- it has to run after the language checks that fill that table.
local function check_scripts()
	local modname = "scripts/data"
	local scripts = mw.loadData("Module:" .. modname)

	-- Build a wikilink to the script's category; "???" when the name is missing.
	local function link(name)
		if not name then
			return "???"
		end

		local title = name
		if not name:find("[Ss]cript$") then
			title = name .. " script"
		end
		return "[[:Category:" .. title .. "|" .. title .. "]]"
	end

	for code, data in pairs(scripts) do
		local name = data.canonicalName
		local characters = data.characters

		-- A canonical name is required. A name that is already registered is
		-- left alone; no discrepancy is reported for it.
		if not name then
			discrepancy(modname, "Code <code>%s</code> has no canonical name specified.", code)
		elseif not script_names[name] then
			script_names[name] = code
		end

		-- Unused scripts are reported; the message also mentions a missing
		-- `characters` value when that applies.
		if not nonempty_scrs[code] then
			local suffix = ""
			if not characters then
				suffix = " and has no characters listed for auto-detection"
			end
			discrepancy(modname, "%s (<code>%s</code>) is not used by any language%s.", link(name), code, suffix)
		end

		-- Try the pattern against an empty string: only whether the call raises
		-- matters, not whether anything matches.
		if characters and not pcall(mw.ustring.find, "", characters) then
			discrepancy(modname, "%s (<code>%s</code>) specifies an invalid pattern for character detection: <code>%s</code>", link(name), code, characters)
		end
	end
end

-- Entry point: run every consistency check and return the report as wikitext.
--
-- frame: the argument passed by the caller; it is not used.
--
-- Returns a success banner when nothing was found, otherwise a warning banner
-- followed by one section per data module (sorted by module name), each
-- listing that module's discrepancies as bullet points.
function export.perform(frame)
	-- Start from a clean slate so the function can be called more than once in
	-- the same Lua state. Without this, codes registered by an earlier run
	-- would all be reported as duplicates.
	messages = {}
	all_codes = {}
	language_names = {}
	family_names = {}
	script_names = {}
	nonempty_fams = {}
	nonempty_scrs = {}

	check_languages()
	check_etym_languages()

	-- families and scripts must be checked AFTER languages; languages checks fill out
	-- the nonempty_fams and nonempty_scrs tables, used for testing if a family/script
	-- is ever used in the data
	check_families()
	check_scripts()

	-- Collect and sort the module names that have messages.
	local modnames = {}

	for modname in pairs(messages) do
		table.insert(modnames, modname)
	end

	table.sort(modnames)

	-- Are there any messages?
	if #modnames == 0 then
		return '<b class="success">Glory to Arstotzka.</b>'
	end

	-- Format into a separate buffer. `messages` keeps its lists, so it is never
	-- left holding strings where discrepancy() expects tables.
	local parts = { '<b class="warning">Discrepancies detected:</b>' }

	for _, modname in ipairs(modnames) do
		table.insert(parts, '\n===[[Module:' .. modname .. ']]===\n*' .. table.concat(messages[modname], '\n* ') .. '\n')
	end

	return table.concat(parts)
end

-- Hand the table of exported functions to whoever loads this module.
return export