@@ Line 184: / Line 184: @@
---[[--------------------------< I S _ D O M A I N _ N A M E >--------------------------------------------------
+--[=[-------------------------< I S _ D O M A I N _ N A M E >--------------------------------------------------
 Does this thing that purports to be a domain name seem to be a valid domain name?
+Syntax defined here: http://tools.ietf.org/html/rfc1034#section-3.5
+BNF defined here: https://tools.ietf.org/html/rfc4234
+Single character names are generally reserved; see https://tools.ietf.org/html/draft-ietf-dnsind-iana-dns-01#page-15;
+	see also [[Single-letter second-level domain]]
+list of tlds: https://www.iana.org/domains/root/db
 rfc952 (modified by rfc 1123) requires the first and last character of a hostname to be a letter or a digit.  Between
-the first and last characters the name may use letters, digits, and the hyphen. Single character names are not allowed.
+the first and last characters the name may use letters, digits, and the hyphen.
 Also allowed are IPv4 addresses. IPv6 not supported
-There are three tests: the first is looking for a hostname that is 2 to n letters or digits followed by a dot and a
+domain is expected to be stripped of any path so that the last character is the last character of the tld.  tld
-letter (tld); the second looks for a hostname that is 3 to n characters where the first and last are letters or
+is two or more alpha characters.  Any preceding '//' (from splitting a url with a scheme) will be stripped
-digits and the middle characters are letters, digits, or the hyphen; the whole followed by a dot and a letter or digit.
+here.  Perhaps not necessary but retained in case it is necessary for IPv4 dot decimal.
-The third test is for IPv4 dot-decimal address format; tld not allowed.
+There are several tests:
+	the first character of the whole domain name including subdomains must be a letter or a digit
+	single-letter/digit second-level domains in the .org TLD
+	q, x, and z SL domains in the .com TLD
+	i and q SL domains in the .net TLD
+	single-letter SL domains in the ccTLDs (where the ccTLD is two letters)
+	two-character SL domains in gTLDs (where the gTLD is two or more letters)
+	three-plus-character SL domains in gTLDs (where the gTLD is two or more letters)
+	IPv4 dot-decimal address format; TLD not allowed
 returns true if domain appears to be a proper name and tld or IPv4 address, else false
-]]
+]=]
 local function is_domain_name (domain)
@@ Line 209: / Line 224: @@
 	domain = domain:gsub ('^//', '');											-- strip '//' from domain name if present; done here so we only have to do it once
-	if domain:match ('^[%a%d][%a%d]+%.%a') then									-- two character hostname and tld
+	if not domain:match ('^[%a%d]') then										-- first character must be letter or digit
+		return false;
+	end
+	if domain:match ('%f[%a%d][%a%d]%.org$') then								-- one character .org hostname
+		return true;
+	elseif domain:match ('%f[%a][qxz]%.com$') then								-- assigned one character .com hostname (x.com times out 2015-12-10)
 		return true;
-	elseif domain:match ('^[%a%d][%a%d%-]+[%a%d]%.[%a%d]') then					-- three or more character hostname.hostname or hostname.tld
+	elseif domain:match ('%f[%a][iq]%.net$') then								-- assigned one character .net hostname (q.net registered but not active 2015-12-10)
+		return true;
+	elseif domain:match ('%f[%a%d][%a%d]%.%a%a$') then							-- one character hostname and cctld (2 chars)
+		return true;
+	elseif domain:match ('%f[%a%d][%a%d][%a%d]%.%a%a+$') then					-- two character hostname and tld
+		return true;
+	elseif domain:match ('%f[%a%d][%a%d][%a%d%-]+[%a%d]%.%a%a+$') then			-- three or more character hostname.hostname or hostname.tld
 		return true;
 	elseif domain:match ('^%d%d?%d?%.%d%d?%d?%.%d%d?%d?%.%d%d?%d?') then		-- IPv4 address
@@ Line 242: / Line 269: @@
 --[[--------------------------< S P L I T _ U R L >------------------------------------------------------------
-Split a url into a scheme and domain pair and return both parts.  If protocol relative url, return nil for scheme
+Split a url into a scheme, authority indicator, and domain.
-and domain else return nil for both scheme and domain.
+If protocol relative url, return nil scheme and domain else return nil for both scheme and domain.
+When not protocol relative, get scheme, authority indicator, and domain.  If there is an authority indicator (one
+or more '/' characters following the scheme's colon), make sure that there are only 2.
 ]]
 local function split_url (url_str)
-	local scheme, domain;
+	local scheme, authority, domain;
-	if url_str:match ('%S-:%S+') then											-- if there is what appears to be a scheme domain pair
+	url_str = url_str:gsub ('(%a)/.*', '%1');									-- strip path information (the capture prevents false replacement of '//')
-		scheme, domain = url_str:match ('(%S-:)(%S+)');							-- extract the scheme and domain portions
-	elseif url_str:match ('//%S*') then											-- if there is what appears to be a protocol relative url
+	if url_str:match ('^//%S*') then											-- if there is what appears to be a protocol relative url
-		domain = url_str:match ('//(%S*)')
+		domain = url_str:match ('^//(%S*)')
+	elseif url_str:match ('%S-:/*%S+') then										-- if there is what appears to be a scheme, optional authority indicator, and domain name
+		scheme, authority, domain = url_str:match ('(%S-:)(/*)(%S+)');			-- extract the scheme, authority indicator, and domain portions
+		authority = authority:gsub ('//', '', 1);								-- replace 1 pair of '/' with nothing;
+		if is_set(authority) then												-- if anything left (1 or 3+ '/' where authority should be) then
+			domain = nil;														-- set to nil which will cause an error message
+		end
 	end
@@ Line 266: / Line 302: @@
 Link parameters are to hold the title of a wikipedia article so none of the WP:TITLESPECIALCHARACTERS are allowed:
 	# < > [ ] | { } _
-except the underscore which is used as a space in wiki urls
+except the underscore which is used as a space in wiki urls and # which is used for section links
 returns false when the value contains any of these characters.
@@ Line 277: / Line 313: @@
 local function link_param_ok (value)
 	local scheme, domain;
-	if value:find ('[#<>%[%]|{}]') then											-- if any prohibited characters
+	if value:find ('[<>%[%]|{}]') then											-- if any prohibited characters
 		return false;
 	end
@@ Line 291: / Line 327: @@
 First we test for space characters.  If any are found, return false.  Then split the url into scheme and domain
-portions, or for protocol relative (//example.com) urls, just the domain.  Use is_scheme() and is_domain() to
+portions, or for protocol relative (//example.com) urls, just the domain.  Use is_url() to validate the two
-validate the two portions of the url.  If both are valid, or for protocol relative if domain is valid, return true, else false.
+portions of the url.  If both are valid, or for protocol relative if domain is valid, return true, else false.
 ]]
@@ Line 310: / Line 346: @@
 Return true if a parameter value has a string that begins and ends with square brackets [ and ] and the first
-characters following the opening bracket obey the rules of a uri scheme (see check_url()).  The test will also find
+non-space characters following the opening bracket appear to be a url.  The test will also find external wikilinks
-external wikilinks that use protocol relative urls. Also finds bare urls.
+that use protocol relative urls. Also finds bare urls.
 The frontier pattern prevents a match on interwiki links which are similar to scheme:path urls.  The tests that
-find bracketed urls are required because the parameters that call this test (currently |title=, |chapter=, and
+find bracketed urls are required because the parameters that call this test (currently |title=, |chapter=, |work=,
-|work=) may have wikilinks and there are articles or redirects like '//Hus' so, while uncommon, |title=[[//Hus]] is
+and |publisher=) may have wikilinks and there are articles or redirects like '//Hus' so, while uncommon, |title=[[//Hus]]
-possible as might be [[en://Hus]].
+is possible as might be [[en://Hus]].
 ]=]
@@ Line 323: / Line 359: @@
 local scheme, domain;
-	if value:match ('%f[%[]%[%a%S*:%S.*%]') then									-- if ext wikilink with scheme and domain: [xxxx://yyyyy.zzz]
+	value = value:gsub ('([^%s/])/%a.*', '%1');									-- strip path information (the capture prevents false replacement of '//')
-		scheme, domain = value:match ('%f[%[]%[(%a%S*:)(%S.*)%]')
-	elseif value:match ('%f[%[]%[//%S*%.%S*%]') then									-- if protocol relative ext wikilink: [//yyyyy.zzz]
+	if value:match ('%f[%[]%[%a%S*:%S+.*%]') then								-- if ext wikilink with scheme and domain: [xxxx://yyyyy.zzz]
-		domain = value:match ('%f[%[]%[//(%S*%.%S*)%]');
+		scheme, domain = value:match ('%f[%[]%[(%a%S*:)(%S+).*%]')
+	elseif value:match ('%f[%[]%[//%S*%.%S+.*%]') then							-- if protocol relative ext wikilink: [//yyyyy.zzz]
+		domain = value:match ('%f[%[]%[//(%S*%.%S+).*%]');
+	elseif value:match ('%a%S*:%S+') then										-- if bare url with scheme; may have leading or trailing plain text
+		scheme, domain = value:match ('(%a%S*:)(%S+)');
+	elseif value:match ('//%S*%.%S+') then										-- if protocol relative bare url: //yyyyy.zzz; may have leading or trailing plain text
+		domain = value:match ('//(%S*%.%S+)');									-- what is left should be the domain
 	else
-		scheme, domain = split_url (value);										-- get scheme or nil and domain or nil from url;
+		return false;															-- didn't find anything that is obviously a url
 	end
@@ Line 351: / Line 393: @@
 		end
 	end
-	if is_set (error_message) then														-- done looping, if there is an error message, display it
+	if is_set (error_message) then												-- done looping, if there is an error message, display it
 		table.insert( z.message_tail, { set_error( 'param_has_ext_link', {error_message}, true ) } );
 	end
@@ Line 620: / Line 662: @@
 		if value ~= nil and selected ~= alias then								-- if we have already selected one of the aliases
 			local skip;
-			for _, v in ipairs(error_list) do											-- spin through the error list to see if we've added this alias
+			for _, v in ipairs(error_list) do									-- spin through the error list to see if we've added this alias
 				if v == alias then
 					skip = true;
-					break;																-- has been added so stop looking
+					break;														-- has been added so stop looking
 				end
 			end
-			if not skip then															-- has not been added so
+			if not skip then													-- has not been added so
-				table.insert( error_list, alias );									-- add error alias to the error list
+				table.insert( error_list, alias );								-- add error alias to the error list
 			end
 		else
@@ Line 726: / Line 768: @@
 end
+--[[--------------------------< H A S _ I N V I S I B L E _ C H A R S >----------------------------------------
+This function searches a parameter's value for nonprintable or invisible characters.  The search stops at the
+first match.
+This function will detect the visible replacement character when it is part of the wikisource.
+Detects but ignores nowiki and math stripmarkers.  Also detects other named stripmarkers (gallery, math, pre, ref)
+and identifies them with a slightly different error message.  See also coins_cleanup().
+Detects but ignores the character pattern that results from the transclusion of {{'}} templates.
+Output of this function is an error message that identifies the character or the Unicode group, or the stripmarker
+that was detected along with its position (or, for multi-byte characters, the position of its first byte) in the
+parameter value.
+]]
+local function has_invisible_chars (param, v)
+	-- param: the parameter name; only used to label the error message (passed to wrap_style ()).
+	-- v: the parameter's value; this is the string that is searched.
+	-- Returns nothing.  At most one 'invisible_char' error is appended to z.message_tail per call.
+	local position = '';														-- position of invisible char or starting position of stripmarker
+	local dummy;																-- end of matching string; not used but required to hold end position when a capture is returned
+	local capture;																-- used by stripmarker detection to hold name of the stripmarker
+	local i=1;
+	local stripmarker, apostrophe;												-- flags; once set they stay set for the remaining entries, so the order of cfg.invisible_chars matters
+	while cfg.invisible_chars[i] do												-- walk the table until the first missing index
+		local char=cfg.invisible_chars[i][1]									-- the character or group name
+		local pattern=cfg.invisible_chars[i][2]									-- the pattern used to find it
+		position, dummy, capture = mw.ustring.find (v, pattern)					-- see if the parameter value contains characters that match the pattern
+		if position then
+--			if 'nowiki' == capture or 'math' == capture or ('ref' == capture and 'quote' == param) then 	-- nowiki, math, or quote param and ref stripmarker (not an error condition)
+			if 'nowiki' == capture or 'math' == capture then 					-- nowiki, math stripmarker (not an error condition)
+				stripmarker = true;												-- set a flag
+			elseif true == stripmarker and 'delete' == char then				-- because stripmarkers begin and end with the delete char, assume that we've found one end of a stripmarker
+				position = nil;													-- unset
+			elseif 'apostrophe' == char then									-- apostrophe template uses &zwj;, hair space and zero-width space
+				apostrophe = true;												-- remember so that the template's companion characters are ignored below
+			elseif true == apostrophe and in_array (char, {'zero width joiner', 'zero width space', 'hair space'}) then
+				position = nil;													-- unset
+			else
+				local err_msg;
+				if capture then													-- the pattern returned a capture (a stripmarker name): report '<name> <char>'
+					err_msg = capture .. ' ' .. char;
+				else															-- no capture: report '<char> character'
+					err_msg = char .. ' ' .. 'character';
+				end
+
+				table.insert( z.message_tail, { set_error( 'invisible_char', {err_msg, wrap_style ('parameter', param), position}, true ) } );	-- add error message
+				return;															-- and done with this parameter
+			end
+		end
+		i=i+1;																	-- bump our index
+	end
+end
+--[[--------------------------< A R G U M E N T _ W R A P P E R >----------------------------------------------
+Argument wrapper.  This function provides support for argument mapping defined in the configuration file so that
+multiple names can be transparently aliased to single internal variable.
---[[
-Argument wrapper.  This function provides support for argument
-mapping defined in the configuration file so that multiple names
-can be transparently aliased to single internal variable.
 ]]
@@ Line 775: / Line 874: @@
 end
---[[
+--[[--------------------------< V A L I D A T E >--------------------------------------------------------------
 Looks for a parameter's name in the whitelist.
@@ Line 782: / Line 881: @@
 	false - deprecated, supported parameters
 	nil - unsupported parameters
 ]]
@@ Line 1,904: / Line 2,004: @@
 end
+--[[--------------------------< C O I N S _ C L E A N U P >----------------------------------------------------
+Cleanup parameter values for the metadata by removing or replacing invisible characters and certain html entities.
+2015-12-10: there is a bug in mw.text.unstripNoWiki ().  It replaced math stripmarkers with the appropriate content
+when it shouldn't.  See https://phabricator.wikimedia.org/T121085 and Wikipedia_talk:Lua#stripmarkers_and_mw.text.unstripNoWiki.28.29
+TODO: move the replacement patterns and replacement values into a table in /Configuration similar to the invisible
+characters table?
+]]
+local function coins_cleanup (value)
+	-- value: a metadata parameter value (string).  Returns the cleaned-up string.
+	--
+	-- string.gsub () is byte oriented.  A multi-byte UTF-8 character must therefore be matched as a literal byte
+	-- sequence and never listed inside a [] set: a set matches each of its bytes individually, which strips the
+	-- shared lead/continuation bytes (\226, \128, \194, ...) out of unrelated characters and leaves invalid UTF-8.
+	value = mw.text.unstripNoWiki (value);										-- replace nowiki stripmarkers with their content
+	value = value:gsub ('<span class="nowrap" style="padding%-left:0%.1em;">&#39;s</span>', "'s");	-- replace {{'s}} template with simple apostrophe-s
+	value = value:gsub ('&zwj;\226\128\138\039\226\128\139', "'");				-- replace {{'}} with simple apostrophe
+	value = value:gsub ('\226\128\138\039\226\128\139', "'");					-- replace {{'}} with simple apostrophe (as of 2015-12-11)
+	value = value:gsub ('&nbsp;', ' ');											-- replace &nbsp; entity with plain space
+	value = value:gsub ('\226\128\138', ' ');									-- replace hair space with plain space
+	value = value:gsub ('&zwj;', '');											-- remove &zwj; entities
+	value = value:gsub ('\226\128\141', '');									-- remove zero-width joiner (whole three-byte sequence only)
+	value = value:gsub ('\226\128\139', '');									-- remove zero-width space (whole three-byte sequence only)
+	value = value:gsub ('\194\173', ' ');										-- replace soft hyphen (whole two-byte sequence only) with plain space
+	value = value:gsub ('[\009\010\013]', ' ');									-- replace horizontal tab, line feed, carriage return with plain space; single bytes so a set is safe
+	return value;
+end
 --[[--------------------------< C O I N S >--------------------------------------------------------------------
@@ Line 1,915: / Line 2,040: @@
 		return '';
 	end
+	for k, v in pairs (data) do													-- spin through all of the metadata parameter values
+		if 'ID_list' ~= k and 'Authors' ~= k then								-- except the ID_list and Author tables (author nowiki stripmarker done when Author table processed)
+			data[k] = coins_cleanup (v);
+		end
+	end
 	local ctx_ver = "Z39.88-2004";
@@ Line 2,013: / Line 2,144: @@
 	local last, first;
 	for k, v in ipairs( data.Authors ) do
-		last, first = v.last, v.first;
+		last, first = coins_cleanup (v.last), coins_cleanup (v.first or '');	-- replace any nowiki strip markers, non-printing or invisible characters
 		if k == 1 then															-- for the first author name only
 			if is_set(last)  and is_set(first) then								-- set these COinS values if |first= and |last= specify the first author name
@@ Line 2,858: / Line 2,989: @@
 				ID = A['Number'];													-- yes, use it
 			else																-- ID has a value so emit error message
---				ID = ID .. " " .. set_error('redundant_parameters', '<code>&#124;id=</code> and <code>&#124;number=</code>');
 				table.insert( z.message_tail, { set_error('redundant_parameters', {wrap_style ('parameter', 'id') .. ' and ' .. wrap_style ('parameter', 'number')}, true )});
 			end
@@ Line 3,872: / Line 4,002: @@
 ]]
+--[[
 local function has_invisible_chars (param, v)
 	local position = '';
@@ Line 3,880: / Line 4,010: @@
 		local char=cfg.invisible_chars[i][1]									-- the character or group name
 		local pattern=cfg.invisible_chars[i][2]									-- the pattern used to find it
+		v = mw.text.unstripNoWiki( v );											-- remove nowiki stripmarkers
 		position = mw.ustring.find (v, pattern)									-- see if the parameter value contains characters that match the pattern
 		if position then
@@ Line 3,888: / Line 4,019: @@
 	end
 end
+]]
 --[[--------------------------< Z . C I T A T I O N >----------------------------------------------------------
@@ Line 3,970: / Line 4,101: @@
 	for k, v in pairs( args ) do
-		has_invisible_chars (k, v)
+		if 'string' == type (k) then											-- don't evaluate positional parameters
+			has_invisible_chars (k, v);
+		end
 	end
 	return citation0( config, args)

Module:Citation/CS1: Difference between revisions

Module:Citation/CS1 (edit)

Revision as of 10:16, 12 December 2015