Module:Citation/CS1: Difference between revisions

Synch from sandbox;
m>Trappist the monk
m (change link_param_ok (value) to allow underscore;)
m>Trappist the monk
(Synch from sandbox;)
Line 184: Line 184:




--[[--------------------------< I S _ D O M A I N _ N A M E >--------------------------------------------------
--[=[-------------------------< I S _ D O M A I N _ N A M E >--------------------------------------------------


Does this thing that purports to be a domain name seem to be a valid domain name?
Does this thing that purports to be a domain name seem to be a valid domain name?
Syntax defined here: http://tools.ietf.org/html/rfc1034#section-3.5
BNF defined here: https://tools.ietf.org/html/rfc4234
Single character names are generally reserved; see https://tools.ietf.org/html/draft-ietf-dnsind-iana-dns-01#page-15;
see also [[Single-letter second-level domain]]
list of tlds: https://www.iana.org/domains/root/db


rfc952 (modified by rfc 1123) requires the first and last character of a hostname to be a letter or a digit.  Between
rfc952 (modified by rfc 1123) requires the first and last character of a hostname to be a letter or a digit.  Between
the first and last characters the name may use letters, digits, and the hyphen. Single character names are not allowed.
the first and last characters the name may use letters, digits, and the hyphen.


Also allowed are IPv4 addresses. IPv6 not supported
Also allowed are IPv4 addresses. IPv6 not supported


There are three tests: the first is looking for a hostname that is 2 to n letters or digits followed by a dot and a
domain is expected to be stripped of any path so that the last character is the last character of the tld.  tld
letter (tld); the second looks for a hostname that is 3 to n characters where the first and last are letters or
is two or more alpha characters.  Any preceding '//' (from splitting a url with a scheme) will be stripped
digits and the middle characters are letters, digits, or the hyphen; the whole followed by a dot and a letter or digit.
here.  Perhaps not necessary but retained in case it is necessary for IPv4 dot decimal.
The third test is for IPv4 dot-decimal address format; tld not allowed.
 
There are several tests:
the first character of the whole domain name including subdomains must be a letter or a digit
single-letter/digit second-level domains in the .org TLD
q, x, and z SL domains in the .com TLD
i and q SL domains in the .net TLD
single-letter SL domains in the ccTLDs (where the ccTLD is two letters)
two-character SL domains in gTLDs (where the gTLD is two or more letters)
three-plus-character SL domains in gTLDs (where the gTLD is two or more letters)
IPv4 dot-decimal address format; TLD not allowed


returns true if domain appears to be a proper name and tld or IPv4 address, else false
returns true if domain appears to be a proper name and tld or IPv4 address, else false


]]
]=]


local function is_domain_name (domain)
local function is_domain_name (domain)
Line 209: Line 224:
domain = domain:gsub ('^//', ''); -- strip '//' from domain name if present; done here so we only have to do it once
domain = domain:gsub ('^//', ''); -- strip '//' from domain name if present; done here so we only have to do it once
if domain:match ('^[%a%d][%a%d]+%.%a') then -- two character hostname and tld
if not domain:match ('^[%a%d]') then -- first character must be letter or digit
return false;
end
if domain:match ('%f[%a%d][%a%d]%.org$') then -- one character .org hostname
return true;
elseif domain:match ('%f[%a][qxz]%.com$') then -- assigned one character .com hostname (x.com times out 2015-12-10)
return true;
return true;
elseif domain:match ('^[%a%d][%a%d%-]+[%a%d]%.[%a%d]') then -- three or more character hostname.hostname or hostname.tld
elseif domain:match ('%f[%a][iq]%.net$') then -- assigned one character .net hostname (q.net registered but not active 2015-12-10)
return true;
elseif domain:match ('%f[%a%d][%a%d]%.%a%a$') then -- one character hostname and cctld (2 chars)
return true;
elseif domain:match ('%f[%a%d][%a%d][%a%d]%.%a%a+$') then -- two character hostname and tld
return true;
elseif domain:match ('%f[%a%d][%a%d][%a%d%-]+[%a%d]%.%a%a+$') then -- three or more character hostname.hostname or hostname.tld
return true;
return true;
elseif domain:match ('^%d%d?%d?%.%d%d?%d?%.%d%d?%d?%.%d%d?%d?') then -- IPv4 address
elseif domain:match ('^%d%d?%d?%.%d%d?%d?%.%d%d?%d?%.%d%d?%d?') then -- IPv4 address
Line 242: Line 269:
--[[--------------------------< S P L I T _ U R L >------------------------------------------------------------
--[[--------------------------< S P L I T _ U R L >------------------------------------------------------------


Split a url into a scheme and domain pair and return both parts. If protocol relative url, return nil for scheme
Split a url into a scheme, authority indicator, and domain.
and domain else return nil for both scheme and domain.
If protocol relative url, return nil for scheme and return the domain, else return nil for both scheme and domain.
 
When not protocol relative, get scheme, authority indicator, and domain.  If there is an authority indicator (one
or more '/' characters following the scheme's colon), make sure that there are only 2.


]]
]]


local function split_url (url_str)
local function split_url (url_str)
local scheme, domain;
local scheme, authority, domain;
if url_str:match ('%S-:%S+') then -- if there is what appears to be a scheme domain pair
url_str = url_str:gsub ('(%a)/.*', '%1'); -- strip path information (the capture prevents false replacement of '//')
scheme, domain = url_str:match ('(%S-:)(%S+)'); -- extract the scheme and domain portions
 
elseif url_str:match ('//%S*') then -- if there is what appears to be a protocol relative url
if url_str:match ('^//%S*') then -- if there is what appears to be a protocol relative url
domain = url_str:match ('//(%S*)')
domain = url_str:match ('^//(%S*)')
elseif url_str:match ('%S-:/*%S+') then -- if there is what appears to be a scheme, optional authority indicator, and domain name
scheme, authority, domain = url_str:match ('(%S-:)(/*)(%S+)'); -- extract the scheme, authority indicator, and domain portions
authority = authority:gsub ('//', '', 1); -- replace place 1 pair of '/' with nothing;
if is_set(authority) then -- if anything left (1 or 3+ '/' where authority should be) then
domain = nil; -- set to nil which will cause an error message
end
end
end
Line 266: Line 302:
Link parameters are to hold the title of a wikipedia article so none of the WP:TITLESPECIALCHARACTERS are allowed:
Link parameters are to hold the title of a wikipedia article so none of the WP:TITLESPECIALCHARACTERS are allowed:
# < > [ ] | { } _
# < > [ ] | { } _
except the underscore which is used as a space in wiki urls
except the underscore which is used as a space in wiki urls and # which is used for section links


returns false when the value contains any of these characters.
returns false when the value contains any of these characters.
Line 277: Line 313:
local function link_param_ok (value)
local function link_param_ok (value)
local scheme, domain;
local scheme, domain;
if value:find ('[#<>%[%]|{}]') then -- if any prohibited characters
if value:find ('[<>%[%]|{}]') then -- if any prohibited characters
return false;
return false;
end
end
Line 291: Line 327:


First we test for space characters.  If any are found, return false.  Then split the url into scheme and domain
First we test for space characters.  If any are found, return false.  Then split the url into scheme and domain
portions, or for protocol relative (//example.com) urls, just the domain.  Use is_scheme() and is_domain() to
portions, or for protocol relative (//example.com) urls, just the domain.  Use is_url() to validate the two
validate the two portions of the url.  If both are valid, or for protocol relative if domain is valid, return true, else false.
portions of the url.  If both are valid, or for protocol relative if domain is valid, return true, else false.


]]
]]
Line 310: Line 346:


Return true if a parameter value has a string that begins and ends with square brackets [ and ] and the first
Return true if a parameter value has a string that begins and ends with square brackets [ and ] and the first
characters following the opening bracket obey the rules of a uri scheme (see check_url()).  The test will also find
non-space characters following the opening bracket appear to be a url.  The test will also find external wikilinks
external wikilinks that use protocol relative urls. Also finds bare urls.
that use protocol relative urls. Also finds bare urls.


The frontier pattern prevents a match on interwiki links which are similar to scheme:path urls.  The tests that
The frontier pattern prevents a match on interwiki links which are similar to scheme:path urls.  The tests that
find bracketed urls are required because the parameters that call this test (currently |title=, |chapter=, and
find bracketed urls are required because the parameters that call this test (currently |title=, |chapter=, |work=,
|work=) may have wikilinks and there are articles or redirects like '//Hus' so, while uncommon, |title=[[//Hus]] is
and |publisher=) may have wikilinks and there are articles or redirects like '//Hus' so, while uncommon, |title=[[//Hus]]
possible as might be [[en://Hus]].
is possible as might be [[en://Hus]].


]=]
]=]
Line 323: Line 359:
local scheme, domain;
local scheme, domain;


if value:match ('%f[%[]%[%a%S*:%S.*%]') then -- if ext wikilink with scheme and domain: [xxxx://yyyyy.zzz]
value = value:gsub ('([^%s/])/%a.*', '%1'); -- strip path information (the capture prevents false replacement of '//')
scheme, domain = value:match ('%f[%[]%[(%a%S*:)(%S.*)%]')
 
elseif value:match ('%f[%[]%[//%S*%.%S*%]') then -- if protocol relative ext wikilink: [//yyyyy.zzz]
if value:match ('%f[%[]%[%a%S*:%S+.*%]') then -- if ext wikilink with scheme and domain: [xxxx://yyyyy.zzz]
domain = value:match ('%f[%[]%[//(%S*%.%S*)%]');
scheme, domain = value:match ('%f[%[]%[(%a%S*:)(%S+).*%]')
elseif value:match ('%f[%[]%[//%S*%.%S+.*%]') then -- if protocol relative ext wikilink: [//yyyyy.zzz]
domain = value:match ('%f[%[]%[//(%S*%.%S+).*%]');
elseif value:match ('%a%S*:%S+') then -- if bare url with scheme; may have leading or trailing plain text
scheme, domain = value:match ('(%a%S*:)(%S+)');
elseif value:match ('//%S*%.%S+') then -- if protocol relative bare url: //yyyyy.zzz; may have leading or trailing plain text
domain = value:match ('//(%S*%.%S+)'); -- what is left should be the domain
else
else
scheme, domain = split_url (value); -- get scheme or nil and domain or nil from url;
return false; -- didn't find anything that is obviously a url
end
end


Line 351: Line 393:
end
end
end
end
if is_set (error_message) then -- done looping, if there is an error message, display it
if is_set (error_message) then -- done looping, if there is an error message, display it
table.insert( z.message_tail, { set_error( 'param_has_ext_link', {error_message}, true ) } );
table.insert( z.message_tail, { set_error( 'param_has_ext_link', {error_message}, true ) } );
end
end
Line 620: Line 662:
if value ~= nil and selected ~= alias then -- if we have already selected one of the aliases
if value ~= nil and selected ~= alias then -- if we have already selected one of the aliases
local skip;
local skip;
for _, v in ipairs(error_list) do -- spin through the error list to see if we've added this alias
for _, v in ipairs(error_list) do -- spin through the error list to see if we've added this alias
if v == alias then
if v == alias then
skip = true;
skip = true;
break; -- has been added so stop looking  
break; -- has been added so stop looking  
end
end
end
end
if not skip then -- has not been added so
if not skip then -- has not been added so
table.insert( error_list, alias ); -- add error alias to the error list
table.insert( error_list, alias ); -- add error alias to the error list
end
end
else
else
Line 726: Line 768:
end
end


--[[--------------------------< H A S _ I N V I S I B L E _ C H A R S >----------------------------------------

Scans the value of one parameter for nonprintable or invisible characters, using the name/pattern pairs held in
cfg.invisible_chars.  The pairs are tried in table order and the scan ends at the first match that is reported.

The visible replacement character is detected when it is part of the wikisource.

Matches that are noticed but not reported:
	nowiki and math stripmarkers; once one has been seen, a later match of the 'delete' character is taken to be
		one end of that stripmarker (stripmarkers begin and end with the delete character)
	the character sequence left behind by transclusion of the {{'}} template; once the 'apostrophe' entry has
		matched, later matches of zero width joiner, zero width space, and hair space are ignored

Other named stripmarkers (gallery, pre, ref, ...) are reported with a slightly different message that names the
stripmarker.  See also coins_cleanup().

param: name of the parameter being checked (used in the error message)
v: the parameter's value

Returns nothing.  When a reportable match is found an 'invisible_char' error is appended to z.message_tail; the
message identifies the character or Unicode group (or the stripmarker) and its position in the value (for
multi-byte characters, the position of the first byte).

]]

local function has_invisible_chars (param, v)
	local seen_stripmarker = false;			-- true once a nowiki or math stripmarker has matched
	local seen_apostrophe = false;			-- true once the {{'}} template residue has matched
	local apostrophe_parts = {'zero width joiner', 'zero width space', 'hair space'};	-- characters that make up the {{'}} residue
	local index = 1;
	local entry = cfg.invisible_chars[index];

	while entry do
		local char, pattern = entry[1], entry[2];	-- the character or group name, and the pattern used to find it
		local position, _, capture = mw.ustring.find (v, pattern);	-- start of match, end of match (unused), stripmarker name when the pattern has a capture

		if position then
--			if 'nowiki' == capture or 'math' == capture or ('ref' == capture and 'quote' == param) then	-- nowiki, math, or quote param and ref stripmarker (not an error condition)
			if 'nowiki' == capture or 'math' == capture then	-- nowiki, math stripmarker (not an error condition)
				seen_stripmarker = true;	-- remember so that the delete characters that delimit it can be ignored
			elseif seen_stripmarker and 'delete' == char then
				-- stripmarkers begin and end with the delete char; assume this is one end of a stripmarker: ignore
			elseif 'apostrophe' == char then	-- apostrophe template uses &zwj;, hair space and zero-width space
				seen_apostrophe = true;
			elseif seen_apostrophe and in_array (char, apostrophe_parts) then
				-- part of the {{'}} template residue: ignore
			else
				local err_msg = capture and (capture .. ' ' .. char) or (char .. ' ' .. 'character');	-- stripmarker wording vs. plain character wording
				table.insert( z.message_tail, { set_error( 'invisible_char', {err_msg, wrap_style ('parameter', param), position}, true ) } );	-- add error message
				return;						-- and done with this parameter
			end
		end

		index = index + 1;					-- on to the next name/pattern pair
		entry = cfg.invisible_chars[index];
	end
end
--[[--------------------------< A R G U M E N T _ W R A P P E R >----------------------------------------------
Argument wrapper.  This function provides support for argument mapping defined in the configuration file so that
multiple names can be transparently aliased to single internal variable.


--[[
Argument wrapper.  This function provides support for argument
mapping defined in the configuration file so that multiple names
can be transparently aliased to single internal variable.
]]
]]


Line 775: Line 874:
end
end


--[[
--[[--------------------------< V A L I D A T E >--------------------------------------------------------------
Looks for a parameter's name in the whitelist.
Looks for a parameter's name in the whitelist.


Line 782: Line 881:
false - deprecated, supported parameters
false - deprecated, supported parameters
nil - unsupported parameters
nil - unsupported parameters
]]
]]


Line 1,904: Line 2,004:
end
end
    
    
--[[--------------------------< C O I N S _ C L E A N U P >----------------------------------------------------

Cleanup parameter values for the metadata by removing or replacing invisible characters and certain html entities.

value: the metadata string to clean
returns: the cleaned string

Steps, in order:
	replace nowiki stripmarkers with their content
	replace the markup emitted by the {{'s}} and {{'}} templates with a plain apostrophe(-s)
	replace &nbsp; and hair space with a plain space
	remove &zwj;, zero-width joiner (U+200D), and zero-width space (U+200B)
	replace soft hyphen (U+00AD), horizontal tab, line feed, and carriage return with a plain space

All replacements use string.gsub, which works on bytes.  Multi-byte UTF-8 characters are therefore matched as
literal byte sequences and never placed inside a [...] set: a set would match each byte individually and so
damage every other character that happens to share a byte with them (en dash, curly quotes, (c), etc.).

2015-12-10: there is a bug in mw.text.unstripNoWiki ().  It replaced math stripmarkers with the appropriate content
when it shouldn't.  See https://phabricator.wikimedia.org/T121085 and Wikipedia_talk:Lua#stripmarkers_and_mw.text.unstripNoWiki.28.29

TODO: move the replacement patterns and replacement values into a table in /Configuration similar to the invisible
characters table?

]]

local function coins_cleanup (value)
	value = mw.text.unstripNoWiki (value);										-- replace nowiki stripmarkers with their content
	value = value:gsub ('<span class="nowrap" style="padding%-left:0%.1em;">&#39;s</span>', "'s");	-- replace {{'s}} template with simple apostrophe-s
	value = value:gsub ('&zwj;\226\128\138\039\226\128\139', "'");				-- replace {{'}} with simple apostrophe
	value = value:gsub ('\226\128\138\039\226\128\139', "'");					-- replace {{'}} with simple apostrophe (as of 2015-12-11)
	value = value:gsub ('&nbsp;', ' ');											-- replace &nbsp; entity with plain space
	value = value:gsub ('\226\128\138', ' ');									-- replace hair space (U+200A) with plain space
	value = value:gsub ('&zwj;', '');											-- remove &zwj; entities
	value = value:gsub ('\226\128\141', '');									-- remove zero-width joiner (U+200D) as a whole byte sequence
	value = value:gsub ('\226\128\139', '');									-- remove zero-width space (U+200B) as a whole byte sequence
	value = value:gsub ('\194\173', ' ');										-- replace soft hyphen (U+00AD) as a whole byte sequence with plain space
	value = value:gsub ('[\009\010\013]', ' ');									-- replace horizontal tab, line feed, carriage return with plain space
	return value;																-- assignment above keeps only gsub's first result, so a single string is returned
end


--[[--------------------------< C O I N S >--------------------------------------------------------------------
--[[--------------------------< C O I N S >--------------------------------------------------------------------
Line 1,915: Line 2,040:
return '';
return '';
end
end
 
for k, v in pairs (data) do -- spin through all of the metadata parameter values
if 'ID_list' ~= k and 'Authors' ~= k then -- except the ID_list and Author tables (author nowiki stripmarker done when Author table processed)
data[k] = coins_cleanup (v);
end
end
 
local ctx_ver = "Z39.88-2004";
local ctx_ver = "Z39.88-2004";
Line 2,013: Line 2,144:
local last, first;
local last, first;
for k, v in ipairs( data.Authors ) do
for k, v in ipairs( data.Authors ) do
last, first = v.last, v.first;
last, first = coins_cleanup (v.last), coins_cleanup (v.first or ''); -- replace any nowiki strip markers, non-printing or invisible characers
if k == 1 then -- for the first author name only
if k == 1 then -- for the first author name only
if is_set(last)  and is_set(first) then -- set these COinS values if |first= and |last= specify the first author name
if is_set(last)  and is_set(first) then -- set these COinS values if |first= and |last= specify the first author name
Line 2,858: Line 2,989:
ID = A['Number']; -- yes, use it
ID = A['Number']; -- yes, use it
else -- ID has a value so emit error message
else -- ID has a value so emit error message
-- ID = ID .. " " .. set_error('redundant_parameters', '<code>&#124;id=</code> and <code>&#124;number=</code>');
table.insert( z.message_tail, { set_error('redundant_parameters', {wrap_style ('parameter', 'id') .. ' and ' .. wrap_style ('parameter', 'number')}, true )});
table.insert( z.message_tail, { set_error('redundant_parameters', {wrap_style ('parameter', 'id') .. ' and ' .. wrap_style ('parameter', 'number')}, true )});
end
end
Line 3,872: Line 4,002:


]]
]]
 
--[[
local function has_invisible_chars (param, v)
local function has_invisible_chars (param, v)
local position = '';
local position = '';
Line 3,880: Line 4,010:
local char=cfg.invisible_chars[i][1] -- the character or group name
local char=cfg.invisible_chars[i][1] -- the character or group name
local pattern=cfg.invisible_chars[i][2] -- the pattern used to find it
local pattern=cfg.invisible_chars[i][2] -- the pattern used to find it
v = mw.text.unstripNoWiki( v ); -- remove nowiki stripmarkers
position = mw.ustring.find (v, pattern) -- see if the parameter value contains characters that match the pattern
position = mw.ustring.find (v, pattern) -- see if the parameter value contains characters that match the pattern
if position then
if position then
Line 3,888: Line 4,019:
end
end
end
end
 
]]


--[[--------------------------< Z . C I T A T I O N >----------------------------------------------------------
--[[--------------------------< Z . C I T A T I O N >----------------------------------------------------------
Line 3,970: Line 4,101:


for k, v in pairs( args ) do
for k, v in pairs( args ) do
has_invisible_chars (k, v)
if 'string' == type (k) then -- don't evaluate positional parameters
has_invisible_chars (k, v);
end
end
end
return citation0( config, args)
return citation0( config, args)
Anonymous user