From 6dfee624cb4a3c35f67e7de2235f33e6cf8dbffb Mon Sep 17 00:00:00 2001 From: aaronskiba Date: Sun, 14 Sep 2025 22:14:06 -0600 Subject: [PATCH 01/14] Percent-encode non-ASCII chars in `query_ror` `query_ror()` calls `http_get()`, which includes a `HTTParty.get()` call. Without this percent-encoding, HTTParty throws InvalidURIError when given non-ASCII characters. TODO: Determine if any other `http_get()` callers require percent-encoding. --- app/services/external_apis/ror_service.rb | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/app/services/external_apis/ror_service.rb b/app/services/external_apis/ror_service.rb index e639bf2b94..3c646105c9 100644 --- a/app/services/external_apis/ror_service.rb +++ b/app/services/external_apis/ror_service.rb @@ -81,9 +81,13 @@ def search(term:, filters: []) def query_ror(term:, page: 1, filters: []) return [] unless term.present? + # Percent-encode the term + # (HTTParty.get() throws InvalidURIError when given non-ASCII characters) + encoded_term = URI.encode_www_form_component(term) + # build the URL target = "#{api_base_url}#{search_path}" - query = query_string(term: term, page: page, filters: filters) + query = query_string(term: encoded_term, page: page, filters: filters) # Call the ROR API and log any errors resp = http_get(uri: "#{target}?#{query}", additional_headers: {}, From 5c754362411e18b74d858e81541d651a3f2f439e Mon Sep 17 00:00:00 2001 From: aaronskiba Date: Tue, 26 Aug 2025 13:59:03 -0600 Subject: [PATCH 02/14] Make `v1` explicit when using ROR api Our code relies on ROR's v1 API. https://ror.readme.io/docs/rest-api states the following: Changes to the ROR API begin the week of July 28, 2025 Beginning the week of July 28, 2025, ROR API requests with no version in the path will default to responses that use version 2 of the ROR schema instead of version 1. Read more in our changelog. 
--- config/initializers/external_apis/ror.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/initializers/external_apis/ror.rb b/config/initializers/external_apis/ror.rb index 2b37937234..c324638063 100644 --- a/config/initializers/external_apis/ror.rb +++ b/config/initializers/external_apis/ror.rb @@ -5,7 +5,7 @@ # the API and to verify that your configuration settings are correct, # please refer to: https://github.com/ror-community/ror-api Rails.configuration.x.ror.landing_page_url = 'https://ror.org/' -Rails.configuration.x.ror.api_base_url = 'https://api.ror.org/' +Rails.configuration.x.ror.api_base_url = 'https://api.ror.org/v1/' Rails.configuration.x.ror.heartbeat_path = 'heartbeat' Rails.configuration.x.ror.search_path = 'organizations' Rails.configuration.x.ror.max_pages = 2 From e1de44c9a6316f1388963814a2968b1a54e4f522 Mon Sep 17 00:00:00 2001 From: aaronskiba Date: Tue, 26 Aug 2025 14:12:19 -0600 Subject: [PATCH 03/14] Remove repetitive rendering of ROR/Fundref scheme names This change addresses the following suggestion for improving the UX (link includes screenshots): https://github.com/portagenetwork/roadmap/issues/837#issuecomment-3224124609 --- app/views/orgs/_external_identifiers.html.erb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/views/orgs/_external_identifiers.html.erb b/app/views/orgs/_external_identifiers.html.erb index 010180c216..04d738ac95 100644 --- a/app/views/orgs/_external_identifiers.html.erb +++ b/app/views/orgs/_external_identifiers.html.erb @@ -17,7 +17,7 @@
<% id = presenter.id_for_scheme(scheme: scheme) %> <%= scheme.description %>: - <%= id_for_display(id: id) %> + <%= id_for_display(id: id, with_scheme_name: false) %>
<% end %> @@ -62,7 +62,7 @@
<% id = presenter.id_for_scheme(scheme: scheme) %> <%= scheme.description %>: - <%= id_for_display(id: id) %> + <%= id_for_display(id: id, with_scheme_name: false) %>
<% end %> From 6bb157fb0ccb2ea3b40941df012b767f51718633 Mon Sep 17 00:00:00 2001 From: aaronskiba Date: Tue, 26 Aug 2025 15:59:07 -0600 Subject: [PATCH 04/14] cp `upgrade:retrieve_ror_fundref_ids` to `lib/tasks/orgs.rake` --- lib/tasks/orgs.rake | 72 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 lib/tasks/orgs.rake diff --git a/lib/tasks/orgs.rake b/lib/tasks/orgs.rake new file mode 100644 index 0000000000..af0080074d --- /dev/null +++ b/lib/tasks/orgs.rake @@ -0,0 +1,72 @@ +# frozen_string_literal: true + +namespace :orgs do + desc 'retrieves ROR ids for each of the Orgs defined in the database' + task retrieve_ror_fundref_ids: :environment do + ror = IdentifierScheme.find_by(name: 'ror') + fundref = IdentifierScheme.find_by(name: 'fundref') + + out = CSV.generate do |csv| + csv << %w[org_id org_name ror_name ror_id fundref_id] + + if ExternalApis::RorService.ping + # rubocop:disable Layout/LineLength + p 'Scanning ROR for each of your existing Orgs' + p 'The results will be written to tmp/ror_fundref_ids.csv to facilitate review and any corrections that may need to be made.' + p 'The CSV file contains the Org name stored in your DB next to the ROR org name that was matched. Use these 2 values to determine if the match was valid.' + p 'You can use the ROR search page to find the correct match for any organizations that need to be corrected: https://ror.org/search' + p '' + # rubocop:enable Layout/LineLength + orgs = Org.includes(identifiers: :identifier_scheme) + .where(is_other: false).order(:name) + + orgs.each do |org| + # If the Org already has a ROR identifier skip it + next if org.identifiers.any? { |id| id.identifier_scheme_id == ror.id } + + # The abbreviation sometimes causes weird results so strip it off + # in this instance + org_name = org.name.gsub(" (#{org.abbreviation})", '') + rslts = OrgSelection::SearchService.search_externally(search_term: org_name) + next unless rslts.any? 
+ + # Just use the first match that contains the search term + rslt = rslts.find { |r| r[:weight] <= 1 } + next unless rslt.present? + + ror_id = rslt[:ror] + fundref_id = rslt[:fundref] + + if ror_id.present? + ror_ident = Identifier.find_or_initialize_by(identifiable: org, + identifier_scheme: ror) + ror_ident.value = "#{ror.identifier_prefix}#{ror_id}" + ror_ident.save + p " #{org.name} -> ROR: #{ror_ident.value}, #{rslt[:name]}" + end + if fundref_id.present? + fr_ident = Identifier.find_or_initialize_by(identifiable: org, + identifier_scheme: fundref) + fr_ident.value = "#{fundref.identifier_prefix}#{fundref_id}" + fr_ident.save + p " #{org.name} -> FUNDRF: #{fr_ident.value}, #{rslt[:name]}" + end + + if ror_id.present? || fundref_id.present? + csv << [org.id, org.name, rslt[:name], ror_ident&.value, fr_ident&.value] + end + end + else + # rubocop:disable Layout/LineLength + p 'ROR appears to be offline or your configuration is invalid. Heartbeat check failed. Refer to the log for more information.' + # rubocop:enable Layout/LineLength + end + end + + if out.present? 
+ file = File.open('tmp/ror_fundref_ids.csv', 'w') + file.puts out + file.close + end + end +end From 4456e971dc6f03fa151c5a5417a5dbebc2f54c88 Mon Sep 17 00:00:00 2001 From: aaronskiba Date: Tue, 26 Aug 2025 16:14:08 -0600 Subject: [PATCH 05/14] Apply .managed filter to ror/fundref org updates --- lib/tasks/orgs.rake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/tasks/orgs.rake b/lib/tasks/orgs.rake index af0080074d..0ef60f7723 100644 --- a/lib/tasks/orgs.rake +++ b/lib/tasks/orgs.rake @@ -18,7 +18,7 @@ namespace :orgs do p '' # rubocop:enable Layout/LineLength orgs = Org.includes(identifiers: :identifier_scheme) - .where(is_other: false).order(:name) + .where(managed: true, is_other: false).order(:name) orgs.each do |org| # If the Org already has a ROR identifier skip it From 6120184155742aece3db2062971d03b38990479b Mon Sep 17 00:00:00 2001 From: aaronskiba Date: Wed, 10 Sep 2025 12:31:56 -0600 Subject: [PATCH 06/14] Guard against missing IdentifierScheme(s) This change ensures that the task exits gracefully if either of the required schemes (ror or fundref) is missing from the db. Refactoring is also performed. 
--- lib/tasks/orgs.rake | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/lib/tasks/orgs.rake b/lib/tasks/orgs.rake index 0ef60f7723..b666ec529f 100644 --- a/lib/tasks/orgs.rake +++ b/lib/tasks/orgs.rake @@ -3,8 +3,8 @@ namespace :orgs do desc 'retrieves ROR ids for each of the Orgs defined in the database' task retrieve_ror_fundref_ids: :environment do - ror = IdentifierScheme.find_by(name: 'ror') - fundref = IdentifierScheme.find_by(name: 'fundref') + ror, fundref = fetch_identifier_schemes + return unless ror && fundref out = CSV.generate do |csv| csv << %w[org_id org_name ror_name ror_id fundref_id] @@ -69,4 +69,15 @@ namespace :orgs do file.close end end + + def fetch_identifier_schemes + ror = IdentifierScheme.find_by(name: 'ror') + fundref = IdentifierScheme.find_by(name: 'fundref') + + if ror.nil? || fundref.nil? + p "Missing IdentifierScheme(s): ror: #{ror.inspect}, fundref: #{fundref.inspect}" + p 'Both must exist in DB for this task to run.' + end + [ror, fundref] + end end From 16a037ab156cff91bc2b2980a805308f319a13bf Mon Sep 17 00:00:00 2001 From: aaronskiba Date: Wed, 10 Sep 2025 11:37:48 -0600 Subject: [PATCH 07/14] Refactor: Simplify CSV logic in org/ror rake task - Replaced CSV.generate + File.open with CSV.open - This change should also be more memory-efficient because rows are now written directly. Prior to this change, the full string was built in memory. 
--- lib/tasks/orgs.rake | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/lib/tasks/orgs.rake b/lib/tasks/orgs.rake index b666ec529f..a80393fb9a 100644 --- a/lib/tasks/orgs.rake +++ b/lib/tasks/orgs.rake @@ -6,7 +6,7 @@ namespace :orgs do ror, fundref = fetch_identifier_schemes return unless ror && fundref - out = CSV.generate do |csv| + CSV.open('tmp/ror_fundref_ids.csv', 'w') do |csv| csv << %w[org_id org_name ror_name ror_id fundref_id] if ExternalApis::RorService.ping @@ -62,12 +62,6 @@ namespace :orgs do # rubocop:enable Layout/LineLength end end - - if out.present? - file = File.open('tmp/ror_fundref_ids.csv', 'w') - file.puts out - file.close - end end def fetch_identifier_schemes From 60f708abc1c8f3dfc0a3e9ef275876d5f0ce5d72 Mon Sep 17 00:00:00 2001 From: aaronskiba Date: Thu, 11 Sep 2025 10:32:40 -0600 Subject: [PATCH 08/14] Improve best match result in org / ror rake task `OrgSelection::SearchService#weigh` states "The lower the weight the closer the match". - This change still attempts to find a result with weight <= 1. However, now a result with weight == 0 is prioritised, allowing for closer potential matches. --- lib/tasks/orgs.rake | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/tasks/orgs.rake b/lib/tasks/orgs.rake index a80393fb9a..14b9a48821 100644 --- a/lib/tasks/orgs.rake +++ b/lib/tasks/orgs.rake @@ -30,8 +30,9 @@ namespace :orgs do rslts = OrgSelection::SearchService.search_externally(search_term: org_name) next unless rslts.any? - # Just use the first match that contains the search term - rslt = rslts.find { |r| r[:weight] <= 1 } + # Find the best match + # (See OrgSelection::SearchService#weigh for how weight is calculated.) + rslt = rslts.find { |r| (r[:weight]).zero? } || rslts.find { |r| r[:weight] == 1 } next unless rslt.present? 
ror_id = rslt[:ror] From 7950593614c71b3d2e5740495cdfe5dcb8307a65 Mon Sep 17 00:00:00 2001 From: aaronskiba Date: Thu, 11 Sep 2025 12:45:01 -0600 Subject: [PATCH 09/14] Refactor orgs/ROR rake task --- lib/tasks/orgs.rake | 161 +++++++++++++++++++++++++++----------------- 1 file changed, 100 insertions(+), 61 deletions(-) diff --git a/lib/tasks/orgs.rake b/lib/tasks/orgs.rake index 14b9a48821..13cc2601e3 100644 --- a/lib/tasks/orgs.rake +++ b/lib/tasks/orgs.rake @@ -1,66 +1,29 @@ # frozen_string_literal: true +CSV_FILE_PATH = Rails.root.join('tmp', 'ror_fundref_ids.csv') +CSV_HEADERS = %w[org_id org_name ror_name ror_id fundref_id].freeze + namespace :orgs do - desc 'retrieves ROR ids for each of the Orgs defined in the database' - task retrieve_ror_fundref_ids: :environment do + desc 'Updates DB and Creates CSV with Org-related ROR/Fundref data' + task update_ror_data: :environment do ror, fundref = fetch_identifier_schemes - return unless ror && fundref - - CSV.open('tmp/ror_fundref_ids.csv', 'w') do |csv| - csv << %w[org_id org_name ror_name ror_id fundref_id] - - if ExternalApis::RorService.ping - # rubocop:disable Layout/LineLength - p 'Scanning ROR for each of your existing Orgs' - p 'The results will be written to tmp/ror_fundref_ids.csv to facilitate review and any corrections that may need to be made.' - p 'The CSV file contains the Org name stored in your DB next to the ROR org name that was matched. Use these 2 values to determine if the match was valid.' - p 'You can use the ROR search page to find the correct match for any organizations that need to be corrected: https://ror.org/search' - p '' - # rubocop:enable Layout/LineLength - orgs = Org.includes(identifiers: :identifier_scheme) - .where(managed: true, is_other: false).order(:name) - - orgs.each do |org| - # If the Org already has a ROR identifier skip it - next if org.identifiers.any? 
{ |id| id.identifier_scheme_id == ror.id } - - # The abbreviation sometimes causes weird results so strip it off - # in this instance - org_name = org.name.gsub(" (#{org.abbreviation})", '') - rslts = OrgSelection::SearchService.search_externally(search_term: org_name) - next unless rslts.any? - - # Find the best match - # (See OrgSelection::SearchService#weigh for how weight is calculated.) - rslt = rslts.find { |r| (r[:weight]).zero? } || rslts.find { |r| r[:weight] == 1 } - next unless rslt.present? - - ror_id = rslt[:ror] - fundref_id = rslt[:fundref] - - if ror_id.present? - ror_ident = Identifier.find_or_initialize_by(identifiable: org, - identifier_scheme: ror) - ror_ident.value = "#{ror.identifier_prefix}#{ror_id}" - ror_ident.save - p " #{org.name} -> ROR: #{ror_ident.value}, #{rslt[:name]}" - end - if fundref_id.present? - fr_ident = Identifier.find_or_initialize_by(identifiable: org, - identifier_scheme: fundref) - fr_ident.value = "#{fundref.identifier_prefix}#{fundref_id}" - fr_ident.save - p " #{org.name} -> FUNDRF: #{fr_ident.value}, #{rslt[:name]}" - end - - if ror_id.present? || fundref_id.present? - csv << [org.id, org.name, rslt[:name], ror_ident&.value, fr_ident&.value] - end - end - else - # rubocop:disable Layout/LineLength - p 'ROR appears to be offline or your configuration is invalid. Heartbeat check failed. Refer to the log for more information.' - # rubocop:enable Layout/LineLength + # Only proceed if the identifier schemes and the ROR API are all available + return unless ror && fundref && ror_service_available? + + print_intro_message + + CSV.open(CSV_FILE_PATH, 'w', write_headers: true, headers: CSV_HEADERS) do |csv| + org_scope.each do |org| + # If the Org already has a ROR identifier skip it + next if org_has_ror_identifier?(org, ror) + + rslts = ror_search_results_for_org(org) + next unless rslts.any? + + rslt = best_match_from_results(rslts) + next unless rslt.present? 
+ + handle_result(org, ror, fundref, rslt, csv) end end end @@ -70,9 +33,85 @@ namespace :orgs do fundref = IdentifierScheme.find_by(name: 'fundref') if ror.nil? || fundref.nil? - p "Missing IdentifierScheme(s): ror: #{ror.inspect}, fundref: #{fundref.inspect}" - p 'Both must exist in DB for this task to run.' + puts "Missing IdentifierScheme(s): ror: #{ror.inspect}, fundref: #{fundref.inspect}" + puts 'Both must exist in DB for this task to run.' end [ror, fundref] end + + def ror_service_available? + ok = ExternalApis::RorService.ping + unless ok + puts 'ROR appears to be offline or your configuration is invalid. ' \ + 'Heartbeat check failed. Refer to the log for more information.' + end + ok + end + + def org_has_ror_identifier?(org, ror) + org.identifiers.any? { |id| id.identifier_scheme_id == ror.id } + end + + def print_intro_message + puts <<~MSG + Scanning ROR for each of your existing Orgs. + The results will be written to "#{CSV_FILE_PATH}" to facilitate#{' '} + review and any corrections that may need to be made. + The CSV file contains the Org name stored in your DB next to the ROR org#{' '} + name that was matched. Use these 2 values to determine if the match was valid. + You can use the ROR search page to find the correct match for any organizations#{' '} + that need to be corrected: https://ror.org/search + + MSG + end + + def org_scope + scope = Org.includes(identifiers: :identifier_scheme) + .where(managed: true, is_other: false) + .order(:name) + puts "Found #{scope.size} org(s) to process." + scope + end + + def ror_search_results_for_org(org) + # The abbreviation sometimes causes weird results so strip it off in this instance + org_name = org.name.gsub(" (#{org.abbreviation})", '') + OrgSelection::SearchService.search_externally(search_term: org_name) + end + + def best_match_from_results(rslts) + # Find the best match + # (See OrgSelection::SearchService#weigh for how weight is calculated.) + rslts.find { |r| (r[:weight]).zero? 
} || rslts.find { |r| r[:weight] == 1 } + end + + def handle_result(org, ror, fundref, result, csv) + return unless result[:ror].present? || result[:fundref].present? + + # Save ROR and FUNDREF entries to DB + identifiers = handle_identifiers(org, ror, fundref, result) + # Add entry to generated CSV + csv << [org.id, org.name, result[:name], identifiers[:ror]&.value, identifiers[:fundref]&.value] + end + + def handle_identifiers(org, ror, fundref, result) + { + ror: handle_identifier(org, ror, result[:ror], result[:name], 'ROR'), + fundref: handle_identifier(org, fundref, result[:fundref], result[:name], 'FUNDREF') + } + end + + def handle_identifier(org, identifier_scheme, id, name, label) + return unless id.present? + + identifier = Identifier.find_or_initialize_by(identifiable: org, + identifier_scheme: identifier_scheme) + begin + identifier.update!(value: "#{identifier_scheme.identifier_prefix}#{id}") + puts "#{org.name} -> #{label}: #{identifier.value}, #{name}" + rescue StandardError => e + puts "Failed to update #{org.name} -> #{label}: #{e.message}" + end + identifier + end end From c4e725f79ed8fbeaad69b7c0ab284ffc54daab85 Mon Sep 17 00:00:00 2001 From: aaronskiba Date: Sun, 14 Sep 2025 22:29:35 -0600 Subject: [PATCH 10/14] Update CSV handling: add weights and unmatched results - Updated CSV to include weight column. Knowing this value provides us with a level of confidence that we can have in the new Identifier entries we are writing to the db. - Updated rake task to also log unmatched results to generated CSV. With these values/org names, we can determine whether ROR/Fundref values aren't available, or if there may be issues with the org names themselves, or rake task itself, etc. 
- Also added puts statements for unmatched results - Renamed variable `rslt` to `result` and `rslts` to `results` --- lib/tasks/orgs.rake | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/lib/tasks/orgs.rake b/lib/tasks/orgs.rake index 13cc2601e3..cc12584d1d 100644 --- a/lib/tasks/orgs.rake +++ b/lib/tasks/orgs.rake @@ -1,7 +1,7 @@ # frozen_string_literal: true CSV_FILE_PATH = Rails.root.join('tmp', 'ror_fundref_ids.csv') -CSV_HEADERS = %w[org_id org_name ror_name ror_id fundref_id].freeze +CSV_HEADERS = %w[org_id org_name ror_name ror_id fundref_id weight].freeze namespace :orgs do desc 'Updates DB and Creates CSV with Org-related ROR/Fundref data' @@ -17,13 +17,13 @@ namespace :orgs do # If the Org already has a ROR identifier skip it next if org_has_ror_identifier?(org, ror) - rslts = ror_search_results_for_org(org) - next unless rslts.any? - - rslt = best_match_from_results(rslts) - next unless rslt.present? - - handle_result(org, ror, fundref, rslt, csv) + results = ror_search_results_for_org(org) + result = best_match_from_results(results) if results.any? + if result.present? + handle_matched_result(org, ror, fundref, result, csv) + else + handle_unmatched_result(org, csv) + end end end end @@ -79,19 +79,24 @@ namespace :orgs do OrgSelection::SearchService.search_externally(search_term: org_name) end - def best_match_from_results(rslts) + def best_match_from_results(results) # Find the best match # (See OrgSelection::SearchService#weigh for how weight is calculated.) - rslts.find { |r| (r[:weight]).zero? } || rslts.find { |r| r[:weight] == 1 } + results.find { |r| (r[:weight]).zero? 
} || results.find { |r| r[:weight] == 1 } + end + + def handle_unmatched_result(org, csv) + puts "⚠️ No results found for Org with id: #{org.id} and name: #{org.name}" + csv << [org.id, org.name, nil, nil, nil, nil] end - def handle_result(org, ror, fundref, result, csv) + def handle_matched_result(org, ror, fundref, result, csv) return unless result[:ror].present? || result[:fundref].present? # Save ROR and FUNDREF entries to DB identifiers = handle_identifiers(org, ror, fundref, result) # Add entry to generated CSV - csv << [org.id, org.name, result[:name], identifiers[:ror]&.value, identifiers[:fundref]&.value] + csv << [org.id, org.name, result[:name], identifiers[:ror]&.value, identifiers[:fundref]&.value, result[:weight]] end def handle_identifiers(org, ror, fundref, result) @@ -108,9 +113,11 @@ namespace :orgs do identifier_scheme: identifier_scheme) begin identifier.update!(value: "#{identifier_scheme.identifier_prefix}#{id}") - puts "#{org.name} -> #{label}: #{identifier.value}, #{name}" + puts "✅ Updated #{org.name} -> #{label}: #{identifier.value}, #{name}" rescue StandardError => e - puts "Failed to update #{org.name} -> #{label}: #{e.message}" + message = "❌ Failed to update #{org.name} -> #{label}: #{e.message}" + puts message + Rails.logger.error(message) end identifier end From 73ec93fb6371b20115980c5f4f5b16e14fbcc23b Mon Sep 17 00:00:00 2001 From: aaronskiba Date: Mon, 15 Sep 2025 12:41:51 -0600 Subject: [PATCH 11/14] Refactor: Namespace ROR Rake task in module Refactor `task update_ror_data` into `module Orgs::UpdateRorService`. - Helper methods now live inside the module instead of globally, preventing accidental overrides by other tasks or code. - The module is placed in `app/services/orgs/` to follow Rails conventions, enabling autoloading and keeping service code separate from Rake task definitions. - Using Orgs (plural) for the module namespace avoids conflicts with the existing Org ActiveRecord model. 
--- app/services/orgs/update_ror_service.rb | 127 ++++++++++++++++++++++++ lib/tasks/orgs.rake | 120 +--------------------- 2 files changed, 128 insertions(+), 119 deletions(-) create mode 100644 app/services/orgs/update_ror_service.rb diff --git a/app/services/orgs/update_ror_service.rb b/app/services/orgs/update_ror_service.rb new file mode 100644 index 0000000000..8a4e116e3c --- /dev/null +++ b/app/services/orgs/update_ror_service.rb @@ -0,0 +1,127 @@ +module Orgs + module UpdateRorService + extend self + + CSV_FILE_PATH = Rails.root.join('tmp', 'ror_fundref_ids.csv') + CSV_HEADERS = %w[org_id org_name ror_name ror_id fundref_id weight].freeze + + def run + ror, fundref = fetch_identifier_schemes + # Only proceed if the identifier schemes and the ROR API are all available + return unless ror && fundref && ror_service_available? + + print_intro_message + + CSV.open(CSV_FILE_PATH, 'w', write_headers: true, headers: CSV_HEADERS) do |csv| + org_scope.each do |org| + # If the Org already has a ROR identifier skip it + next if org_has_ror_identifier?(org, ror) + + results = ror_search_results_for_org(org) + result = best_match_from_results(results) if results.any? + if result.present? + handle_matched_result(org, ror, fundref, result, csv) + else + handle_unmatched_result(org, csv) + end + end + end + end + + private + + def fetch_identifier_schemes + ror = IdentifierScheme.find_by(name: 'ror') + fundref = IdentifierScheme.find_by(name: 'fundref') + + if ror.nil? || fundref.nil? + puts "Missing IdentifierScheme(s): ror: #{ror.inspect}, fundref: #{fundref.inspect}" + puts 'Both must exist in DB for this task to run.' + end + [ror, fundref] + end + + def ror_service_available? + ok = ExternalApis::RorService.ping + unless ok + puts 'ROR appears to be offline or your configuration is invalid. ' \ + 'Heartbeat check failed. Refer to the log for more information.' + end + ok + end + + def org_has_ror_identifier?(org, ror) + org.identifiers.any? 
{ |id| id.identifier_scheme_id == ror.id } + end + + def print_intro_message + puts <<~MSG + Scanning ROR for each of your existing Orgs. + The results will be written to "#{CSV_FILE_PATH}" to facilitate#{' '} + review and any corrections that may need to be made. + The CSV file contains the Org name stored in your DB next to the ROR org#{' '} + name that was matched. Use these 2 values to determine if the match was valid. + You can use the ROR search page to find the correct match for any organizations#{' '} + that need to be corrected: https://ror.org/search + + MSG + end + + def org_scope + scope = Org.includes(identifiers: :identifier_scheme) + .where(managed: true, is_other: false) + .order(:name) + puts "Found #{scope.size} org(s) to process." + scope + end + + def ror_search_results_for_org(org) + # The abbreviation sometimes causes weird results so strip it off in this instance + org_name = org.name.gsub(" (#{org.abbreviation})", '') + OrgSelection::SearchService.search_externally(search_term: org_name) + end + + def best_match_from_results(results) + # Find the best match + # (See OrgSelection::SearchService#weigh for how weight is calculated.) + results.find { |r| (r[:weight]).zero? } || results.find { |r| r[:weight] == 1 } + end + + def handle_unmatched_result(org, csv) + puts "⚠️ No results found for Org with id: #{org.id} and name: #{org.name}" + csv << [org.id, org.name, nil, nil, nil, nil] + end + + def handle_matched_result(org, ror, fundref, result, csv) + return unless result[:ror].present? || result[:fundref].present? 
+ + # Save ROR and FUNDREF entries to DB + identifiers = handle_identifiers(org, ror, fundref, result) + # Add entry to generated CSV + csv << [org.id, org.name, result[:name], identifiers[:ror]&.value, identifiers[:fundref]&.value, result[:weight]] + end + + def handle_identifiers(org, ror, fundref, result) + { + ror: handle_identifier(org, ror, result[:ror], result[:name], 'ROR'), + fundref: handle_identifier(org, fundref, result[:fundref], result[:name], 'FUNDREF') + } + end + + def handle_identifier(org, identifier_scheme, id, name, label) + return unless id.present? + + identifier = Identifier.find_or_initialize_by(identifiable: org, + identifier_scheme: identifier_scheme) + begin + identifier.update!(value: "#{identifier_scheme.identifier_prefix}#{id}") + puts "✅ Updated #{org.name} -> #{label}: #{identifier.value}, #{name}" + rescue StandardError => e + message = "❌ Failed to update #{org.name} -> #{label}: #{e.message}" + puts message + Rails.logger.error(message) + end + identifier + end + end +end diff --git a/lib/tasks/orgs.rake b/lib/tasks/orgs.rake index cc12584d1d..661883f0c0 100644 --- a/lib/tasks/orgs.rake +++ b/lib/tasks/orgs.rake @@ -1,124 +1,6 @@ -# frozen_string_literal: true - -CSV_FILE_PATH = Rails.root.join('tmp', 'ror_fundref_ids.csv') -CSV_HEADERS = %w[org_id org_name ror_name ror_id fundref_id weight].freeze - namespace :orgs do desc 'Updates DB and Creates CSV with Org-related ROR/Fundref data' task update_ror_data: :environment do - ror, fundref = fetch_identifier_schemes - # Only proceed if the identifier schemes and the ROR API are all available - return unless ror && fundref && ror_service_available? - - print_intro_message - - CSV.open(CSV_FILE_PATH, 'w', write_headers: true, headers: CSV_HEADERS) do |csv| - org_scope.each do |org| - # If the Org already has a ROR identifier skip it - next if org_has_ror_identifier?(org, ror) - - results = ror_search_results_for_org(org) - result = best_match_from_results(results) if results.any? 
- if result.present? - handle_matched_result(org, ror, fundref, result, csv) - else - handle_unmatched_result(org, csv) - end - end - end - end - - def fetch_identifier_schemes - ror = IdentifierScheme.find_by(name: 'ror') - fundref = IdentifierScheme.find_by(name: 'fundref') - - if ror.nil? || fundref.nil? - puts "Missing IdentifierScheme(s): ror: #{ror.inspect}, fundref: #{fundref.inspect}" - puts 'Both must exist in DB for this task to run.' - end - [ror, fundref] - end - - def ror_service_available? - ok = ExternalApis::RorService.ping - unless ok - puts 'ROR appears to be offline or your configuration is invalid. ' \ - 'Heartbeat check failed. Refer to the log for more information.' - end - ok - end - - def org_has_ror_identifier?(org, ror) - org.identifiers.any? { |id| id.identifier_scheme_id == ror.id } - end - - def print_intro_message - puts <<~MSG - Scanning ROR for each of your existing Orgs. - The results will be written to "#{CSV_FILE_PATH}" to facilitate#{' '} - review and any corrections that may need to be made. - The CSV file contains the Org name stored in your DB next to the ROR org#{' '} - name that was matched. Use these 2 values to determine if the match was valid. - You can use the ROR search page to find the correct match for any organizations#{' '} - that need to be corrected: https://ror.org/search - - MSG - end - - def org_scope - scope = Org.includes(identifiers: :identifier_scheme) - .where(managed: true, is_other: false) - .order(:name) - puts "Found #{scope.size} org(s) to process." - scope - end - - def ror_search_results_for_org(org) - # The abbreviation sometimes causes weird results so strip it off in this instance - org_name = org.name.gsub(" (#{org.abbreviation})", '') - OrgSelection::SearchService.search_externally(search_term: org_name) - end - - def best_match_from_results(results) - # Find the best match - # (See OrgSelection::SearchService#weigh for how weight is calculated.) - results.find { |r| (r[:weight]).zero? 
} || results.find { |r| r[:weight] == 1 } - end - - def handle_unmatched_result(org, csv) - puts "⚠️ No results found for Org with id: #{org.id} and name: #{org.name}" - csv << [org.id, org.name, nil, nil, nil, nil] - end - - def handle_matched_result(org, ror, fundref, result, csv) - return unless result[:ror].present? || result[:fundref].present? - - # Save ROR and FUNDREF entries to DB - identifiers = handle_identifiers(org, ror, fundref, result) - # Add entry to generated CSV - csv << [org.id, org.name, result[:name], identifiers[:ror]&.value, identifiers[:fundref]&.value, result[:weight]] - end - - def handle_identifiers(org, ror, fundref, result) - { - ror: handle_identifier(org, ror, result[:ror], result[:name], 'ROR'), - fundref: handle_identifier(org, fundref, result[:fundref], result[:name], 'FUNDREF') - } - end - - def handle_identifier(org, identifier_scheme, id, name, label) - return unless id.present? - - identifier = Identifier.find_or_initialize_by(identifiable: org, - identifier_scheme: identifier_scheme) - begin - identifier.update!(value: "#{identifier_scheme.identifier_prefix}#{id}") - puts "✅ Updated #{org.name} -> #{label}: #{identifier.value}, #{name}" - rescue StandardError => e - message = "❌ Failed to update #{org.name} -> #{label}: #{e.message}" - puts message - Rails.logger.error(message) - end - identifier + Orgs::UpdateRorService.run end end From 22d8f4085180d5a46ca94d913e781a829140014b Mon Sep 17 00:00:00 2001 From: aaronskiba Date: Thu, 18 Sep 2025 10:22:08 -0600 Subject: [PATCH 12/14] Make rubocop happy This refactor and added comments are being made to address rubocop offences. 
--- app/services/orgs/update_ror_service.rb | 30 +++++++++++++++---------- lib/tasks/orgs.rake | 2 ++ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/app/services/orgs/update_ror_service.rb b/app/services/orgs/update_ror_service.rb index 8a4e116e3c..862323800c 100644 --- a/app/services/orgs/update_ror_service.rb +++ b/app/services/orgs/update_ror_service.rb @@ -1,4 +1,8 @@ +# frozen_string_literal: true + module Orgs + # Service object that updates Org records with ROR and FundRef identifiers, and writes results to CSV + # Invoked by the `orgs:update_ror_data` Rake task (lib/tasks/orgs.rake) module UpdateRorService extend self @@ -13,23 +17,25 @@ def run print_intro_message CSV.open(CSV_FILE_PATH, 'w', write_headers: true, headers: CSV_HEADERS) do |csv| - org_scope.each do |org| - # If the Org already has a ROR identifier skip it - next if org_has_ror_identifier?(org, ror) - - results = ror_search_results_for_org(org) - result = best_match_from_results(results) if results.any? - if result.present? - handle_matched_result(org, ror, fundref, result, csv) - else - handle_unmatched_result(org, csv) - end - end + org_scope.each { |org| process_org(org, ror, fundref, csv) } end end private + def process_org(org, ror, fundref, csv) + # If the Org already has a ROR identifier, skip it + return if org_has_ror_identifier?(org, ror) + + results = ror_search_results_for_org(org) + result = best_match_from_results(results) if results.any? + if result.present? 
+ handle_matched_result(org, ror, fundref, result, csv) + else + handle_unmatched_result(org, csv) + end + end + def fetch_identifier_schemes ror = IdentifierScheme.find_by(name: 'ror') fundref = IdentifierScheme.find_by(name: 'fundref') diff --git a/lib/tasks/orgs.rake b/lib/tasks/orgs.rake index 661883f0c0..8db8c6858d 100644 --- a/lib/tasks/orgs.rake +++ b/lib/tasks/orgs.rake @@ -1,3 +1,5 @@ +# frozen_string_literal: true + namespace :orgs do desc 'Updates DB and Creates CSV with Org-related ROR/Fundref data' task update_ror_data: :environment do From 76e309ac9a5f28f8321c9c3b9027702471a41c1d Mon Sep 17 00:00:00 2001 From: aaronskiba Date: Tue, 4 Nov 2025 12:12:55 -0700 Subject: [PATCH 13/14] Add conditional flag to update existing ROR/Fundref data - Add `UPDATE_EXISTING` environment variable to control whether orgs that already have ROR/Fundref identifiers should be updated. - Default behavior remains the same: existing identifiers are skipped. - Rake task usage documented with example: `UPDATE_EXISTING=true bundle exec rake orgs:update_ror_data` - Update `Orgs::UpdateRorService` to accept and utilise the `update_existing` keyword argument. --- app/services/orgs/update_ror_service.rb | 10 +++++----- lib/tasks/orgs.rake | 6 +++++- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/app/services/orgs/update_ror_service.rb b/app/services/orgs/update_ror_service.rb index 862323800c..360f640504 100644 --- a/app/services/orgs/update_ror_service.rb +++ b/app/services/orgs/update_ror_service.rb @@ -9,7 +9,7 @@ module UpdateRorService CSV_FILE_PATH = Rails.root.join('tmp', 'ror_fundref_ids.csv') CSV_HEADERS = %w[org_id org_name ror_name ror_id fundref_id weight].freeze - def run + def run(update_existing: false) ror, fundref = fetch_identifier_schemes # Only proceed if the identifier schemes and the ROR API are all available return unless ror && fundref && ror_service_available? 
@@ -17,15 +17,15 @@ def run print_intro_message CSV.open(CSV_FILE_PATH, 'w', write_headers: true, headers: CSV_HEADERS) do |csv| - org_scope.each { |org| process_org(org, ror, fundref, csv) } + org_scope.each { |org| process_org(org, ror, fundref, csv, update_existing: update_existing) } end end private - def process_org(org, ror, fundref, csv) + def process_org(org, ror, fundref, csv, update_existing: false) # If the Org already has a ROR identifier, skip it - return if org_has_ror_identifier?(org, ror) + return if !update_existing && org_has_ror_identifier?(org, ror) results = ror_search_results_for_org(org) result = best_match_from_results(results) if results.any? @@ -90,7 +90,7 @@ def ror_search_results_for_org(org) def best_match_from_results(results) # Find the best match # (See OrgSelection::SearchService#weigh for how weight is calculated.) - results.find { |r| (r[:weight]).zero? } || results.find { |r| r[:weight] == 1 } + results.find { |r| r[:weight].zero? } || results.find { |r| r[:weight] == 1 } end def handle_unmatched_result(org, csv) diff --git a/lib/tasks/orgs.rake b/lib/tasks/orgs.rake index 8db8c6858d..2cb883dd72 100644 --- a/lib/tasks/orgs.rake +++ b/lib/tasks/orgs.rake @@ -3,6 +3,10 @@ namespace :orgs do desc 'Updates DB and Creates CSV with Org-related ROR/Fundref data' task update_ror_data: :environment do - Orgs::UpdateRorService.run + # By default, existing ROR/Fundref data is not updated. + # - To update existing data, prepend `UPDATE_EXISTING=true` + # - (e.g. 
`UPDATE_EXISTING=true bundle exec rake orgs:update_ror_data`) + update_existing = ENV['UPDATE_EXISTING'] == 'true' + Orgs::UpdateRorService.run(update_existing: update_existing) end end From 4a813b2e6ec048d92f65cf8be004e1470516c7a7 Mon Sep 17 00:00:00 2001 From: aaronskiba Date: Thu, 20 Nov 2025 14:14:45 -0700 Subject: [PATCH 14/14] Add ror & fundref IdentifierSchemes to db/seeds.rb Used the same values from `task add_new_identifier_schemes` and `task contextualize_identifier_schemes` (see `lib/tasks/upgrade.rake`) for adding seeds. - However, `fundref.identifier_prefix = 'https://doi.org/10.13039/'` doesn't seem to be correct in `lib/tasks/upgrade.rake`. - "https://api.crossref.org/funders/" is used in DMP Assistant's DB, so that was used instead. --- db/seeds.rb | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/db/seeds.rb b/db/seeds.rb index 21f84bdfb2..9d473d4170 100755 --- a/db/seeds.rb +++ b/db/seeds.rb @@ -33,6 +33,22 @@ description: 'Your institutional credentials', active: true, context: 11 + }, + { + name: 'fundref', + description: 'Crossref Funder Registry (FundRef)', + active: true, + identifier_prefix: 'https://api.crossref.org/funders/', + # Only add the :for_orgs context + context: 2 + }, + { + name: 'ror', + description: 'Research Organization Registry (ROR)', + active: true, + identifier_prefix: 'https://ror.org/', + # Only add the :for_orgs context + context: 2 } ] identifier_schemes.each { |is| IdentifierScheme.find_or_create_by(is) }