Module:Citation/CS1: Difference between revisions
Synch from sandbox;
m>Trappist the monk m (change link_param_ok (value) to allow underscore;) |
m>Trappist the monk (Synch from sandbox;) |
||
Line 184: | Line 184: | ||
--[[ | --[=[-------------------------< I S _ D O M A I N _ N A M E >-------------------------------------------------- | ||
Does this thing that purports to be a domain name seem to be a valid domain name? | Does this thing that purports to be a domain name seem to be a valid domain name? | ||
Syntax defined here: http://tools.ietf.org/html/rfc1034#section-3.5 | |||
BNF defined here: https://tools.ietf.org/html/rfc4234 | |||
Single character names are generally reserved; see https://tools.ietf.org/html/draft-ietf-dnsind-iana-dns-01#page-15; | |||
see also [[Single-letter second-level domain]] | |||
list of tlds: https://www.iana.org/domains/root/db | |||
rfc952 (modified by rfc 1123) requires the first and last character of a hostname to be a letter or a digit. Between | rfc952 (modified by rfc 1123) requires the first and last character of a hostname to be a letter or a digit. Between | ||
the first and last characters the name may use letters, digits, and the hyphen | the first and last characters the name may use letters, digits, and the hyphen. | ||
Also allowed are IPv4 addresses. IPv6 not supported | Also allowed are IPv4 addresses. IPv6 not supported | ||
There are | domain is expected to be stripped of any path so that the last character is the last character of the tld. tld | ||
letter ( | is two or more alpha characters. Any preceding '//' (from splitting a url with a scheme) will be stripped | ||
here. Perhaps not necessary but retained in case it is necessary for IPv4 dot decimal. | |||
There are several tests: | |||
the first character of the whole domain name including subdomains must be a letter or a digit | |||
single-letter/digit second-level domains in the .org TLD | |||
q, x, and z SL domains in the .com TLD | |||
i and q SL domains in the .net TLD | |||
single-letter SL domains in the ccTLDs (where the ccTLD is two letters) | |||
two-character SL domains in gTLDs (where the gTLD is two or more letters) | |||
three-plus-character SL domains in gTLDs (where the gTLD is two or more letters) | |||
IPv4 dot-decimal address format; TLD not allowed | |||
returns true if domain appears to be a proper name and tld or IPv4 address, else false | returns true if domain appears to be a proper name and tld or IPv4 address, else false | ||
]] | ]=] | ||
local function is_domain_name (domain) | local function is_domain_name (domain) | ||
Line 209: | Line 224: | ||
domain = domain:gsub ('^//', ''); -- strip '//' from domain name if present; done here so we only have to do it once | domain = domain:gsub ('^//', ''); -- strip '//' from domain name if present; done here so we only have to do it once | ||
if domain:match ('^[%a%d][%a%d] | if not domain:match ('^[%a%d]') then -- first character must be letter or digit | ||
return false; | |||
end | |||
if domain:match ('%f[%a%d][%a%d]%.org$') then -- one character .org hostname | |||
return true; | |||
elseif domain:match ('%f[%a][qxz]%.com$') then -- assigned one character .com hostname (x.com times out 2015-12-10) | |||
return true; | return true; | ||
elseif domain:match (' | elseif domain:match ('%f[%a][iq]%.net$') then -- assigned one character .net hostname (q.net registered but not active 2015-12-10) | ||
return true; | |||
elseif domain:match ('%f[%a%d][%a%d]%.%a%a$') then -- one character hostname and cctld (2 chars) | |||
return true; | |||
elseif domain:match ('%f[%a%d][%a%d][%a%d]%.%a%a+$') then -- two character hostname and tld | |||
return true; | |||
elseif domain:match ('%f[%a%d][%a%d][%a%d%-]+[%a%d]%.%a%a+$') then -- three or more character hostname.hostname or hostname.tld | |||
return true; | return true; | ||
elseif domain:match ('^%d%d?%d?%.%d%d?%d?%.%d%d?%d?%.%d%d?%d?') then -- IPv4 address | elseif domain:match ('^%d%d?%d?%.%d%d?%d?%.%d%d?%d?%.%d%d?%d?') then -- IPv4 address | ||
Line 242: | Line 269: | ||
--[[--------------------------< S P L I T _ U R L >------------------------------------------------------------ | --[[--------------------------< S P L I T _ U R L >------------------------------------------------------------ | ||
Split a url into a scheme and domain | Split a url into a scheme, authority indicator, and domain. | ||
and domain else return nil for both scheme and domain. | If protocol relative url, return nil scheme and domain else return nil for both scheme and domain. | ||
When not protocol relative, get scheme, authority indicator, and domain. If there is an authority indicator (one | |||
or more '/' characters following the scheme's colon), make sure that there are only 2. | |||
]] | ]] | ||
local function split_url (url_str) | local function split_url (url_str) | ||
local scheme, domain; | local scheme, authority, domain; | ||
if url_str:match ('%S-:%S+') then | url_str = url_str:gsub ('(%a)/.*', '%1'); -- strip path information (the capture prevents false replacement of '//') | ||
scheme, domain = url_str:match ('(%S-:)(%S+)'); | |||
if url_str:match ('^//%S*') then -- if there is what appears to be a protocol relative url | |||
domain = url_str:match ('^//(%S*)') | |||
elseif url_str:match ('%S-:/*%S+') then -- if there is what appears to be a scheme, optional authority indicator, and domain name | |||
scheme, authority, domain = url_str:match ('(%S-:)(/*)(%S+)'); -- extract the scheme, authority indicator, and domain portions | |||
authority = authority:gsub ('//', '', 1); -- replace 1 pair of '/' with nothing; | |||
if is_set(authority) then -- if anything left (1 or 3+ '/' where authority should be) then | |||
domain = nil; -- set to nil which will cause an error message | |||
end | |||
end | end | ||
Line 266: | Line 302: | ||
Link parameters are to hold the title of a wikipedia article so none of the WP:TITLESPECIALCHARACTERS are allowed: | Link parameters are to hold the title of a wikipedia article so none of the WP:TITLESPECIALCHARACTERS are allowed: | ||
# < > [ ] | { } _ | # < > [ ] | { } _ | ||
except the underscore which is used as a space in wiki urls | except the underscore which is used as a space in wiki urls and # which is used for section links | ||
returns false when the value contains any of these characters. | returns false when the value contains any of these characters. | ||
Line 277: | Line 313: | ||
local function link_param_ok (value) | local function link_param_ok (value) | ||
local scheme, domain; | local scheme, domain; | ||
if value:find ('[ | if value:find ('[<>%[%]|{}]') then -- if any prohibited characters | ||
return false; | return false; | ||
end | end | ||
Line 291: | Line 327: | ||
First we test for space characters. If any are found, return false. Then split the url into scheme and domain | First we test for space characters. If any are found, return false. Then split the url into scheme and domain | ||
portions, or for protocol relative (//example.com) urls, just the domain. Use | portions, or for protocol relative (//example.com) urls, just the domain. Use is_url() to validate the two | ||
validate the two portions of the url. If both are valid, or for protocol relative if domain is valid, return true, else false. | portions of the url. If both are valid, or for protocol relative if domain is valid, return true, else false. | ||
]] | ]] | ||
Line 310: | Line 346: | ||
Return true if a parameter value has a string that begins and ends with square brackets [ and ] and the first | Return true if a parameter value has a string that begins and ends with square brackets [ and ] and the first | ||
characters following the opening bracket | non-space characters following the opening bracket appear to be a url. The test will also find external wikilinks | ||
external wikilinks that use protocol relative urls. Also finds bare urls. | that use protocol relative urls. Also finds bare urls. | ||
The frontier pattern prevents a match on interwiki links which are similar to scheme:path urls. The tests that | The frontier pattern prevents a match on interwiki links which are similar to scheme:path urls. The tests that | ||
find bracketed urls are required because the parameters that call this test (currently |title=, |chapter=, and | find bracketed urls are required because the parameters that call this test (currently |title=, |chapter=, |work=, | ||
| | and |publisher=) may have wikilinks and there are articles or redirects like '//Hus' so, while uncommon, |title=[[//Hus]] | ||
possible as might be [[en://Hus]]. | is possible as might be [[en://Hus]]. | ||
]=] | ]=] | ||
Line 323: | Line 359: | ||
local scheme, domain; | local scheme, domain; | ||
if value:match ('%f[%[]%[%a%S*:%S.*%]') then | value = value:gsub ('([^%s/])/%a.*', '%1'); -- strip path information (the capture prevents false replacement of '//') | ||
scheme, domain = value:match ('%f[%[]%[(%a%S*:)(%S.* | |||
elseif value:match ('%f[%[]%[//%S*%.%S*%]') then | if value:match ('%f[%[]%[%a%S*:%S+.*%]') then -- if ext wikilink with scheme and domain: [xxxx://yyyyy.zzz] | ||
domain = value:match ('%f[%[]%[//(%S*%.%S*)% | scheme, domain = value:match ('%f[%[]%[(%a%S*:)(%S+).*%]') | ||
elseif value:match ('%f[%[]%[//%S*%.%S+.*%]') then -- if protocol relative ext wikilink: [//yyyyy.zzz] | |||
domain = value:match ('%f[%[]%[//(%S*%.%S+).*%]'); | |||
elseif value:match ('%a%S*:%S+') then -- if bare url with scheme; may have leading or trailing plain text | |||
scheme, domain = value:match ('(%a%S*:)(%S+)'); | |||
elseif value:match ('//%S*%.%S+') then -- if protocol relative bare url: //yyyyy.zzz; may have leading or trailing plain text | |||
domain = value:match ('//(%S*%.%S+)'); -- what is left should be the domain | |||
else | else | ||
return false; -- didn't find anything that is obviously a url | |||
end | end | ||
Line 351: | Line 393: | ||
end | end | ||
end | end | ||
if is_set (error_message) then | if is_set (error_message) then -- done looping, if there is an error message, display it | ||
table.insert( z.message_tail, { set_error( 'param_has_ext_link', {error_message}, true ) } ); | table.insert( z.message_tail, { set_error( 'param_has_ext_link', {error_message}, true ) } ); | ||
end | end | ||
Line 620: | Line 662: | ||
if value ~= nil and selected ~= alias then -- if we have already selected one of the aliases | if value ~= nil and selected ~= alias then -- if we have already selected one of the aliases | ||
local skip; | local skip; | ||
for _, v in ipairs(error_list) do | for _, v in ipairs(error_list) do -- spin through the error list to see if we've added this alias | ||
if v == alias then | if v == alias then | ||
skip = true; | skip = true; | ||
break; | break; -- has been added so stop looking | ||
end | end | ||
end | end | ||
if not skip then | if not skip then -- has not been added so | ||
table.insert( error_list, alias ); | table.insert( error_list, alias ); -- add error alias to the error list | ||
end | end | ||
else | else | ||
Line 726: | Line 768: | ||
end | end | ||
--[[--------------------------< H A S _ I N V I S I B L E _ C H A R S >---------------------------------------- | |||
This function searches a parameter's value for nonprintable or invisible characters. The search stops at the | |||
first match. | |||
This function will detect the visible replacement character when it is part of the wikisource. | |||
Detects but ignores nowiki and math stripmarkers. Also detects other named stripmarkers (gallery, math, pre, ref) | |||
and identifies them with a slightly different error message. See also coins_cleanup(). | |||
Detects but ignores the character pattern that results from the transclusion of {{'}} templates. | |||
Output of this function is an error message that identifies the character or the Unicode group, or the stripmarker | |||
that was detected along with its position (or, for multi-byte characters, the position of its first byte) in the | |||
parameter value. | |||
]] | |||
local function has_invisible_chars (param, v) | |||
local position = ''; -- position of invisible char or starting position of stripmarker | |||
local dummy; -- end of matching string; not used but required to hold end position when a capture is returned | |||
local capture; -- used by stripmarker detection to hold name of the stripmarker | |||
local i=1; | |||
local stripmarker, apostrophe; | |||
while cfg.invisible_chars[i] do | |||
local char=cfg.invisible_chars[i][1] -- the character or group name | |||
local pattern=cfg.invisible_chars[i][2] -- the pattern used to find it | |||
position, dummy, capture = mw.ustring.find (v, pattern) -- see if the parameter value contains characters that match the pattern | |||
if position then | |||
-- if 'nowiki' == capture or 'math' == capture or ('ref' == capture and 'quote' == param) then -- nowiki, math, or quote param and ref stripmarker (not an error condition) | |||
if 'nowiki' == capture or 'math' == capture then -- nowiki, math stripmarker (not an error condition) | |||
stripmarker = true; -- set a flag | |||
elseif true == stripmarker and 'delete' == char then -- because stripmarkers begin and end with the delete char, assume that we've found one end of a stripmarker | |||
position = nil; -- unset | |||
elseif 'apostrophe' == char then -- apostrophe template uses &zwj;, hair space and zero-width space | |||
apostrophe = true; | |||
elseif true == apostrophe and in_array (char, {'zero width joiner', 'zero width space', 'hair space'}) then | |||
position = nil; -- unset | |||
else | |||
local err_msg; | |||
if capture then | |||
err_msg = capture .. ' ' .. char; | |||
else | |||
err_msg = char .. ' ' .. 'character'; | |||
end | |||
table.insert( z.message_tail, { set_error( 'invisible_char', {err_msg, wrap_style ('parameter', param), position}, true ) } ); -- add error message | |||
return; -- and done with this parameter | |||
end | |||
end | |||
i=i+1; -- bump our index | |||
end | |||
end | |||
--[[--------------------------< A R G U M E N T _ W R A P P E R >---------------------------------------------- | |||
Argument wrapper. This function provides support for argument mapping defined in the configuration file so that | |||
multiple names can be transparently aliased to single internal variable. | |||
]] | ]] | ||
Line 775: | Line 874: | ||
end | end | ||
--[[ | --[[--------------------------< V A L I D A T E >-------------------------------------------------------------- | ||
Looks for a parameter's name in the whitelist. | Looks for a parameter's name in the whitelist. | ||
Line 782: | Line 881: | ||
false - deprecated, supported parameters | false - deprecated, supported parameters | ||
nil - unsupported parameters | nil - unsupported parameters | ||
]] | ]] | ||
Line 1,904: | Line 2,004: | ||
end | end | ||
--[[--------------------------< C O I N S _ C L E A N U P >---------------------------------------------------- | |||
Cleanup parameter values for the metadata by removing or replacing invisible characters and certain html entities. | |||
2015-12-10: there is a bug in mw.text.unstripNoWiki (). It replaced math stripmarkers with the appropriate content | |||
when it shouldn't. See https://phabricator.wikimedia.org/T121085 and Wikipedia_talk:Lua#stripmarkers_and_mw.text.unstripNoWiki.28.29 | |||
TODO: move the replacement patterns and replacement values into a table in /Configuration similar to the invisible | |||
characters table? | |||
]] | |||
local function coins_cleanup (value) | |||
value = mw.text.unstripNoWiki (value); -- replace nowiki stripmarkers with their content | |||
value = value:gsub ('<span class="nowrap" style="padding%-left:0%.1em;">&#39;s</span>', "'s"); -- replace {{'s}} template with simple apostrophe-s | |||
value = value:gsub ('&zwj;\226\128\138\039\226\128\139', "'"); -- replace {{'}} with simple apostrophe | |||
value = value:gsub ('\226\128\138\039\226\128\139', "'"); -- replace {{'}} with simple apostrophe (as of 2015-12-11) | |||
value = value:gsub ('&nbsp;', ' '); -- replace &nbsp; entity with plain space | |||
value = value:gsub ('\226\128\138', ' '); -- replace hair space with plain space | |||
value = value:gsub ('&zwj;', ''); -- remove &zwj; entities | |||
value = value:gsub ('[\226\128\141\226\128\139]', '') -- remove zero-width joiner, zero-width space | |||
value = value:gsub ('[\194\173\009\010\013]', ' '); -- replace soft hyphen, horizontal tab, line feed, carriage return with plain space | |||
return value; | |||
end | |||
--[[--------------------------< C O I N S >-------------------------------------------------------------------- | --[[--------------------------< C O I N S >-------------------------------------------------------------------- | ||
Line 1,915: | Line 2,040: | ||
return ''; | return ''; | ||
end | end | ||
for k, v in pairs (data) do -- spin through all of the metadata parameter values | |||
if 'ID_list' ~= k and 'Authors' ~= k then -- except the ID_list and Author tables (author nowiki stripmarker done when Author table processed) | |||
data[k] = coins_cleanup (v); | |||
end | |||
end | |||
local ctx_ver = "Z39.88-2004"; | local ctx_ver = "Z39.88-2004"; | ||
Line 2,013: | Line 2,144: | ||
local last, first; | local last, first; | ||
for k, v in ipairs( data.Authors ) do | for k, v in ipairs( data.Authors ) do | ||
last, first = v.last, v.first; | last, first = coins_cleanup (v.last), coins_cleanup (v.first or ''); -- replace any nowiki strip markers, non-printing or invisible characters | ||
if k == 1 then -- for the first author name only | if k == 1 then -- for the first author name only | ||
if is_set(last) and is_set(first) then -- set these COinS values if |first= and |last= specify the first author name | if is_set(last) and is_set(first) then -- set these COinS values if |first= and |last= specify the first author name | ||
Line 2,858: | Line 2,989: | ||
ID = A['Number']; -- yes, use it | ID = A['Number']; -- yes, use it | ||
else -- ID has a value so emit error message | else -- ID has a value so emit error message | ||
table.insert( z.message_tail, { set_error('redundant_parameters', {wrap_style ('parameter', 'id') .. ' and ' .. wrap_style ('parameter', 'number')}, true )}); | table.insert( z.message_tail, { set_error('redundant_parameters', {wrap_style ('parameter', 'id') .. ' and ' .. wrap_style ('parameter', 'number')}, true )}); | ||
end | end | ||
Line 3,872: | Line 4,002: | ||
]] | ]] | ||
--[[ | |||
local function has_invisible_chars (param, v) | local function has_invisible_chars (param, v) | ||
local position = ''; | local position = ''; | ||
Line 3,880: | Line 4,010: | ||
local char=cfg.invisible_chars[i][1] -- the character or group name | local char=cfg.invisible_chars[i][1] -- the character or group name | ||
local pattern=cfg.invisible_chars[i][2] -- the pattern used to find it | local pattern=cfg.invisible_chars[i][2] -- the pattern used to find it | ||
v = mw.text.unstripNoWiki( v ); -- remove nowiki stripmarkers | |||
position = mw.ustring.find (v, pattern) -- see if the parameter value contains characters that match the pattern | position = mw.ustring.find (v, pattern) -- see if the parameter value contains characters that match the pattern | ||
if position then | if position then | ||
Line 3,888: | Line 4,019: | ||
end | end | ||
end | end | ||
]] | |||
--[[--------------------------< Z . C I T A T I O N >---------------------------------------------------------- | --[[--------------------------< Z . C I T A T I O N >---------------------------------------------------------- | ||
Line 3,970: | Line 4,101: | ||
for k, v in pairs( args ) do | for k, v in pairs( args ) do | ||
has_invisible_chars (k, v) | if 'string' == type (k) then -- don't evaluate positional parameters | ||
has_invisible_chars (k, v); | |||
end | |||
end | end | ||
return citation0( config, args) | return citation0( config, args) |