Module:Citation/CS1/COinS: Difference between revisions

    m (31 revisions imported from templatewiki:Module:Citation/CS1/COinS)
    (sync from sandbox;)
    Line 1: Line 1:
    local coins = {};


    --[[--------------------------< F O R W A R D  D E C L A R A T I O N S >--------------------------------------
    --[[--------------------------< F O R W A R D  D E C L A R A T I O N S >--------------------------------------
    ]]
    ]]
    local is_set, in_array, remove_wiki_link; -- functions in Module:Citation/CS1/Utilities
     
    local is_set, in_array, remove_wiki_link, strip_apostrophe_markup; -- functions in Module:Citation/CS1/Utilities


    local cfg; -- table of configuration tables that are defined in Module:Citation/CS1/Configuration
    local cfg; -- table of configuration tables that are defined in Module:Citation/CS1/Configuration
    --[[--------------------------< S T R I P _ A P O S T R O P H E _ M A R K U P >--------------------------------

    Removes wiki bold / italic apostrophe markup from argument so that the markup does not leak into the COinS
    metadata.

    Runs of apostrophes are removed longest first: five (bold italic), then four (italic start and end with no
    content), then three (bold), then two (italic).  After every removal the scan starts again from the longest run,
    and it stops when no run of two or more apostrophes is left.  Single apostrophes are never touched, so malformed
    markup may leave some of them behind.

    Returns argument unchanged when is_set() reports that it is not set.

    ]]

    local function strip_apostrophe_markup (argument)
        if not is_set (argument) then
            return argument;
        end

        -- longest run first; the apostrophe is not a Lua pattern magic character so these literals are safe to use as gsub() patterns
        local markup_runs = {"'''''", "''''", "'''", "''"};
        local removed_something;

        repeat
            removed_something = false;
            for _, run in ipairs (markup_runs) do
                if argument:find (run, 1, true) then -- plain-text search for this run length
                    argument = argument:gsub (run, ""); -- remove every instance; keep only the first gsub() return value
                    removed_something = true;
                    break; -- restart the scan from the longest run
                end
            end
        until not removed_something;

        return argument; -- done
    end




    Line 55: Line 21:
    title = strip_apostrophe_markup (title); -- strip any apostrophe markup
    title = strip_apostrophe_markup (title); -- strip any apostrophe markup
    else
    else
    title=''; -- if not set, make sure title is an empty string
    title = ''; -- if not set, make sure title is an empty string
    end
    end
    if is_set (script) then
    if is_set (script) then
    Line 61: Line 27:
    script = strip_apostrophe_markup (script); -- strip any apostrophe markup
    script = strip_apostrophe_markup (script); -- strip any apostrophe markup
    else
    else
    script=''; -- if not set, make sure script is an empty string
    script = ''; -- if not set, make sure script is an empty string
    end
    end
    if is_set (title) and is_set (script) then
    if is_set (title) and is_set (script) then
    Line 72: Line 38:
    --[[--------------------------< E S C A P E _ L U A _ M A G I C _ C H A R S >----------------------------------
    --[[--------------------------< E S C A P E _ L U A _ M A G I C _ C H A R S >----------------------------------


    Returns a string where all of lua's magic characters have been escaped.  This is important because functions like
    Returns a string where all of Lua's magic characters have been escaped.  This is important because functions like
    string.gsub() treat their pattern and replace strings as patterns, not literal strings.
    string.gsub() treat their pattern and replace strings as patterns, not literal strings.
    ]]
    ]]
    Line 78: Line 44:
    local function escape_lua_magic_chars (argument)
    local function escape_lua_magic_chars (argument)
    argument = argument:gsub("%%", "%%%%"); -- replace % with %%
    argument = argument:gsub("%%", "%%%%"); -- replace % with %%
    argument = argument:gsub("([%^%$%(%)%.%[%]%*%+%-%?])", "%%%1"); -- replace all other lua magic pattern characters
    argument = argument:gsub("([%^%$%(%)%.%[%]%*%+%-%?])", "%%%1"); -- replace all other Lua magic pattern characters
    return argument;
    return argument;
    end
    end
    Line 94: Line 60:
    while true do
    while true do
    pattern = pages:match("%[(%w*:?//[^ ]+%s+)[%w%d].*%]"); -- pattern is the opening bracket, the url and following space(s): "[url "
    pattern = pages:match("%[(%w*:?//[^ ]+%s+)[%w%d].*%]"); -- pattern is the opening bracket, the URL and following space(s): "[url "
    if nil == pattern then break; end -- no more urls
    if nil == pattern then break; end -- no more URLs
    pattern = escape_lua_magic_chars (pattern); -- pattern is not a literal string; escape lua's magic pattern characters
    pattern = escape_lua_magic_chars (pattern); -- pattern is not a literal string; escape Lua's magic pattern characters
    pages = pages:gsub(pattern, ""); -- remove as many instances of pattern as possible
    pages = pages:gsub(pattern, ""); -- remove as many instances of pattern as possible
    end
    end
    pages = pages:gsub("[%[%]]", ""); -- remove the brackets
    pages = pages:gsub("[%[%]]", ""); -- remove the brackets
    pages = pages:gsub("–", "-" ); -- replace endashes with hyphens
    pages = pages:gsub("–", "-" ); -- replace endashes with hyphens
    pages = pages:gsub("&%w+;", "-" ); -- and replace html entities (&ndash; etc.) with hyphens; do we need to replace numerical entities like &#32; and the like?
    pages = pages:gsub("&%w+;", "-" ); -- and replace HTML entities (&ndash; etc.) with hyphens; do we need to replace numerical entities like &#32; and the like?
    return pages;
    return pages;
    end
    end
    Line 114: Line 80:
    MathML with SVG or PNG fallback
    MathML with SVG or PNG fallback


    All three are heavy with html and css which doesn't belong in the metadata.
    All three are heavy with HTML and CSS which doesn't belong in the metadata.


    Without this function, the metadata saved in the raw wikitext contained the rendering determined by the settings
    Without this function, the metadata saved in the raw wikitext contained the rendering determined by the settings
    Line 121: Line 87:
    This function gets the rendered form of an equation according to the editor's preference before the page is saved.  It
    This function gets the rendered form of an equation according to the editor's preference before the page is saved.  It
    then searches the rendering for the text equivalent of the rendered equation and replaces the rendering with that so
    then searches the rendering for the text equivalent of the rendered equation and replaces the rendering with that so
    that the page is saved without extraneous html/css markup and with a reasonably readable text form of the equation.
    that the page is saved without extraneous HTML/CSS markup and with a reasonably readable text form of the equation.


    When a replacement is made, this function returns true and the value with replacement; otherwise false and the intital
    When a replacement is made, this function returns true and the value with replacement; otherwise false and the initial
    value.  To replace multipe equations it is necesary to call this function from within a loop.
    value.  To replace multiple equations it is necessary to call this function from within a loop.


    ]=]
    ]=]
    Line 154: Line 120:
    --[[--------------------------< C O I N S _ C L E A N U P >----------------------------------------------------
    --[[--------------------------< C O I N S _ C L E A N U P >----------------------------------------------------


    Cleanup parameter values for the metadata by removing or replacing invisible characters and certain html entities.
    Cleanup parameter values for the metadata by removing or replacing invisible characters and certain HTML entities.


    2015-12-10: there is a bug in mw.text.unstripNoWiki ().  It replaces math stripmarkers with the appropriate content
    2015-12-10: there is a bug in mw.text.unstripNoWiki ().  It replaces math stripmarkers with the appropriate content
    Line 177: Line 143:
    value = value:gsub ('&nbsp;', ' '); -- replace &nbsp; entity with plain space
    value = value:gsub ('&nbsp;', ' '); -- replace &nbsp; entity with plain space
    value = value:gsub ('\226\128\138', ' '); -- replace hair space with plain space
    value = value:gsub ('\226\128\138', ' '); -- replace hair space with plain space
    if not mw.ustring.find (value, cfg.indic_script) then -- don't remove zero width joiner characters from indic script
    if not mw.ustring.find (value, cfg.indic_script) then -- don't remove zero-width joiner characters from indic script
    value = value:gsub ('&zwj;', ''); -- remove &zwj; entities
    value = value:gsub ('&zwj;', ''); -- remove &zwj; entities
    value = mw.ustring.gsub (value, '[\226\128\141\226\128\139\194\173]', ''); -- remove zero-width joiner, zero-width space, soft hyphen
    value = mw.ustring.gsub (value, '[\226\128\141\226\128\139\194\173]', ''); -- remove zero-width joiner, zero-width space, soft hyphen
    end
    end
    value = value:gsub ('[\009\010\013]', ' '); -- replace horizontal tab, line feed, carriage return with plain space
    value = value:gsub ('[\009\010\013 ]+', ' '); -- replace horizontal tab, line feed, carriage return with plain space
    return value;
    return value;
    end
    end
    Line 214: Line 180:
    });
    });
    if in_array (class, {'arxiv', 'biorxiv', 'citeseerx', 'journal', 'news', 'magazine'}) or (in_array (class, {'conference', 'interview', 'map', 'press release', 'web'}) and is_set(data.Periodical)) or  
    if in_array (class, {'arxiv', 'biorxiv', 'citeseerx', 'ssrn', 'journal', 'news', 'magazine'}) or (in_array (class, {'conference', 'interview', 'map', 'press release', 'web'}) and is_set(data.Periodical)) or  
    ('citation' == class and is_set(data.Periodical) and not is_set (data.Encyclopedia)) then
    ('citation' == class and is_set(data.Periodical) and not is_set (data.Encyclopedia)) then
    OCinSoutput.rft_val_fmt = "info:ofi/fmt:kev:mtx:journal"; -- journal metadata identifier
    OCinSoutput.rft_val_fmt = "info:ofi/fmt:kev:mtx:journal"; -- journal metadata identifier
    if in_array (class, {'arxiv', 'biorxiv', 'citeseerx'}) then -- set genre according to the type of citation template we are rendering
    if in_array (class, {'arxiv', 'biorxiv', 'citeseerx', 'ssrn'}) then -- set genre according to the type of citation template we are rendering
    OCinSoutput["rft.genre"] = "preprint"; -- cite arxiv, cite biorxiv, cite citeseerx
    OCinSoutput["rft.genre"] = "preprint"; -- cite arxiv, cite biorxiv, cite citeseerx, cite ssrn
    elseif 'conference' == class then
    elseif 'conference' == class then
    OCinSoutput["rft.genre"] = "conference"; -- cite conference (when Periodical set)
    OCinSoutput["rft.genre"] = "conference"; -- cite conference (when Periodical set)
    Line 231: Line 197:
    -- these used only for periodicals
    -- these used only for periodicals
    OCinSoutput["rft.ssn"] = data.Season; -- keywords: winter, spring, summer, fall
    OCinSoutput["rft.ssn"] = data.Season; -- keywords: winter, spring, summer, fall
    OCinSoutput["rft.quarter"] = data.Quarter; -- single digits 1->first quarter, etc.
    OCinSoutput["rft.chron"] = data.Chron; -- free-form date components
    OCinSoutput["rft.chron"] = data.Chron; -- free-form date components
    OCinSoutput["rft.volume"] = data.Volume; -- does not apply to books
    OCinSoutput["rft.volume"] = data.Volume; -- does not apply to books
    Line 254: Line 221:
    end
    end
    end
    end
    else --{'audio-visual', 'AV-media-notes', 'DVD-notes', 'episode', 'interview', 'mailinglist', 'map', 'newsgroup', 'podcast', 'press release', 'serial', 'sign', 'speech', 'web'}
    else -- {'audio-visual', 'AV-media-notes', 'DVD-notes', 'episode', 'interview', 'mailinglist', 'map', 'newsgroup', 'podcast', 'press release', 'serial', 'sign', 'speech', 'web'}
    OCinSoutput["rft.genre"] = "unknown";
    OCinSoutput["rft.genre"] = "unknown";
    end
    end
    Line 270: Line 237:
    OCinSoutput['rft.inst'] = data.PublisherName; -- book and dissertation
    OCinSoutput['rft.inst'] = data.PublisherName; -- book and dissertation
    end
    end
    -- NB. Not currently supported are "info:ofi/fmt:kev:mtx:patent", "info:ofi/fmt:kev:mtx:dc", "info:ofi/fmt:kev:mtx:sch_svc", "info:ofi/fmt:kev:mtx:ctx"
    -- and now common parameters (as much as possible)
    -- and now common parameters (as much as possible)
    OCinSoutput["rft.date"] = data.Date; -- book, journal, dissertation
    OCinSoutput["rft.date"] = data.Date; -- book, journal, dissertation
    -- TODO: data.ID_list should also contain identifier validity status (to suppress output of invalid identifier metadata) as well as OL A/M/W/X type and ASIN TLD info (to special-case prefix generation)
    for k, v in pairs( data.ID_list ) do -- what to do about these? For now assume that they are common to all?
    for k, v in pairs( data.ID_list ) do -- what to do about these? For now assume that they are common to all?
    -- if k == 'ISBN' then v = clean_isbn( v ) end
    if k == 'ISBN' then v = v:gsub( "[^-0-9X]", "" ); end
    if k == 'ISBN' then v = v:gsub( "[^-0-9X]", "" ); end
    local id = cfg.id_handlers[k].COinS;
    local id = cfg.id_handlers[k].COinS;
    if string.sub( id or "", 1, 4 ) == 'info' then -- for ids that are in the info:registry
    if string.sub( id or "", 1, 4 ) == 'info' then -- for ids that are in the info:registry
    OCinSoutput["rft_id"] = table.concat{ id, "/", v };
    OCinSoutput["rft_id"] = table.concat{ id, "/", v };
    elseif string.sub (id or "", 1, 3 ) == 'rft' then -- for isbn, issn, eissn, etc that have defined COinS keywords
    elseif string.sub (id or "", 1, 3 ) == 'rft' then -- for isbn, issn, eissn, etc. that have defined COinS keywords
    OCinSoutput[ id ] = v;
    OCinSoutput[ id ] = v;
    elseif id then -- when cfg.id_handlers[k].COinS is not nil
    elseif id then -- when cfg.id_handlers[k].COinS is not nil
    OCinSoutput["rft_id"] = table.concat{ cfg.id_handlers[k].prefix, v }; -- others; provide a url
    OCinSoutput["rft_id"] = table.concat{ cfg.id_handlers[k].prefix, v, cfg.id_handlers[k].suffix or '', "#id-name=", cfg.id_handlers[k].label }; -- others; provide a URL and indicate identifier name as #fragment (human-readable, but transparent to browsers)
    end
    end
    end
    end


    --[[
    for k, v in pairs( data.ID_list ) do -- what to do about these? For now assume that they are common to all?
    local id, value = cfg.id_handlers[k].COinS;
    if k == 'ISBN' then value = clean_isbn( v ); else value = v; end
    if string.sub( id or "", 1, 4 ) == 'info' then
    OCinSoutput["rft_id"] = table.concat{ id, "/", v };
    else
    OCinSoutput[ id ] = value;
    end
    end
    ]]
    local last, first;
    local last, first;
    for k, v in ipairs( data.Authors ) do
    for k, v in ipairs( data.Authors ) do
    last, first = coins_cleanup (v.last), coins_cleanup (v.first or ''); -- replace any nowiki strip markers, non-printing or invisible characers
    last, first = coins_cleanup (v.last), coins_cleanup (v.first or ''); -- replace any nowiki stripmarkers, non-printing or invisible characters
    if k == 1 then -- for the first author name only
    if k == 1 then -- for the first author name only
    if is_set(last) and is_set(first) then -- set these COinS values if |first= and |last= specify the first author name
    if is_set(last) and is_set(first) then -- set these COinS values if |first= and |last= specify the first author name
    OCinSoutput["rft.aulast"] = last; -- book, journal, dissertation
    OCinSoutput["rft.aulast"] = last; -- book, journal, dissertation
    OCinSoutput["rft.aufirst"] = first; -- book, journal, dissertation
    OCinSoutput["rft.aufirst"] = first; -- book, journal, dissertation
    Line 313: Line 271:
    OCinSoutput["rft.au"] = last; -- book, journal, dissertation
    OCinSoutput["rft.au"] = last; -- book, journal, dissertation
    end
    end
    -- TODO: At present we do not report "et al.". Add anything special if this condition applies?
    end
    end
    end
    end
    Line 318: Line 277:
    OCinSoutput.rft_id = data.URL;
    OCinSoutput.rft_id = data.URL;
    OCinSoutput.rfr_id = table.concat{ "info:sid/", mw.site.server:match( "[^/]*$" ), ":", data.RawPage };
    OCinSoutput.rfr_id = table.concat{ "info:sid/", mw.site.server:match( "[^/]*$" ), ":", data.RawPage };
    -- TODO: Add optional extra info:
    -- rfr_dat=#REVISION<version> (referrer private data)
    -- ctx_id=<data.RawPage>#<ref> (identifier for the context object)
    -- ctx_tim=<ts> (timestamp in format yyyy-mm-ddThh:mm:ssTZD or yyyy-mm-dd)
    -- ctx_enc=info:ofi/enc:UTF-8 (character encoding)
    OCinSoutput = setmetatable( OCinSoutput, nil );
    OCinSoutput = setmetatable( OCinSoutput, nil );
     
    -- sort with version string always first, and combine.
    -- sort with version string always first, and combine.
    --table.sort( OCinSoutput );
    -- table.sort( OCinSoutput );
    table.insert( OCinSoutput, 1, "ctx_ver=" .. ctx_ver ); -- such as "Z39.88-2004"
    table.insert( OCinSoutput, 1, "ctx_ver=" .. ctx_ver ); -- such as "Z39.88-2004"
    return table.concat(OCinSoutput, "&");
    return table.concat(OCinSoutput, "&");
    end
    end
    Line 336: Line 302:
    cfg = cfg_table_ptr;
    cfg = cfg_table_ptr;


    is_set = utilities_page_ptr.is_set; -- import functions from select Module:Citation/CS1/Utilities module
    is_set = utilities_page_ptr.is_set; -- import functions from selected Module:Citation/CS1/Utilities module
    in_array = utilities_page_ptr.in_array;
    in_array = utilities_page_ptr.in_array;
    remove_wiki_link = utilities_page_ptr.remove_wiki_link;
    remove_wiki_link = utilities_page_ptr.remove_wiki_link;
    strip_apostrophe_markup = utilities_page_ptr.strip_apostrophe_markup;
    end
    end




    --[[--------------------------< E X P O R T E D  F U N C T I O N S >------------------------------------------
    ]]


    return {
    return {