Match all regexes and sort by match position

ruofeidu · Apr 11, 2024 · 5c51937 · 5c51937
1 parent 6ee2c87
commit 5c51937
Showing 1 changed file with 95 additions and 91 deletions.
diff --git a/DuBibtex.py b/DuBibtex.py
@@ -81,12 +81,12 @@ class Parser:
   # current bibitem and bib ID
   cur, bib = None, ''
 
-  def __init__(self, output_file=None):
+  def __init__(self, output_file=None, use_offline_doi=None):
     config = configparser.ConfigParser()
     config.read("config.ini")
     Paras.header['User-Agent'] = config.get(Paras.section, "header").strip()
     Paras.searchDOI = config.getboolean(Paras.section, "searchDOI")
-    Paras.useOfflineDOI = config.getboolean(Paras.section, "useOfflineDOI")
+    Paras.useOfflineDOI = use_offline_doi if use_offline_doi is not None else config.getboolean(Paras.section, "useOfflineDOI")
     Paras.printSelfInfo = config.getboolean(Paras.section, "printSelfInfo")
     Paras.keepComments = config.getboolean(Paras.section, "keepComments")
     Paras.debugBibCrawler = config.getboolean(Paras.section, "debugBibCrawler")
@@ -350,108 +350,112 @@ def google_lookup(s, parser, use_scholar=False):
   with open('debug.txt', 'w', encoding='utf8') as f:
     f.write(html)
 
-  m = Re.doiAcmUrl.search(html)
-  if m and len(m.groups()) > 0:
-    res = m.groups()[0].replace('\\', '')
-    print("DOI from Google and ACM DOI: %s\n" % res)
-    return res
 
-  # acm = re.compile('citation\.cfm\?id\=([\d\.]+)', flags=re.MULTILINE)
-  m = Re.acm.search(html)
-  if m and len(m.groups()) > 0:
-    # print(m.groups()[0])
-    content_acm = request_url('https://dl.acm.org/citation.cfm?id=%s' %
-                              m.groups()[0])
-    m = Re.doiUrl.search(content_acm, re.M)
+  url_regexes = ['doiAcmUrl', 'acm', 'doiSpringer', 'doiWiley', 'doiUrl', 'ieee', 'doiCaltech', 'doiPubmed']
+
+  found_urls = []
+  for url_regex in url_regexes:
+    m = getattr(Re, url_regex).search(html)
     if m and len(m.groups()) > 0:
-      print(m.groups()[0])
-      res = m.groups()[0]
-      if Paras.debugBibCrawler:
-        print("DOI from Google and ACM CFM: %s\n" % res)
-      return res
-    # content_acm = request_url(
-    #     'https://dl.acm.org/exportformats.cfm?id=%s&expformat=bibtex' %
-    #     m.groups()[0])
-    # m = Re.acmBib.search(content_acm, re.M)
-    # # TODO: month
-    # if m and len(m.groups()) > 0:
-    #   acm_lines = m.groups()[0].splitlines()
-    #   res = ''
-    #   for l in acm_lines:
-    #     if len(l) < 3 or l[0] == '@' or l[0] == '}':
-    #       continue
-    #     mm = Re.item.search(l)
-    #     old_cur = parser.cur.copy()
-    #     if mm and len(mm.groups()) > 0:
-    #       cur_left, cur_right = mm.groups()[0].strip(), mm.groups()[1].strip()
-    #       if cur_left == 'doi':
-    #         res = cur_right
-    #       if cur_left in ['class', 'href', 'doi', 'numpages']:
-    #         continue
-    #       parser.cur[cur_left] = cur_right
-
-    #   dist = levenshtein(old_cur['title'], parser.cur['title'])
-    #   print(dist, old_cur['title'], parser.cur['title'])
-    #   if dist > 2:
-    #     parser.cur = old_cur
-    #     res = ''
-
-    #   if res:
-    #     if Paras.debugBibCrawler:
-    #       print("DOI from Google and ACM BibTeX: %s\n" % res)
-    #     return res
-
-  m = Re.doiSpringer.search(html)
-  if m and len(m.groups()) > 0:
-    res = m.groups()[0].replace('\\', '')
-    print("DOI from Google and Springer: %s\n" % res)
-    return res
+      found_urls.append((url_regex, m, m.start()))
+  # Sort by start position
+  found_urls.sort(key=lambda x: x[2])
 
-  m = Re.doiWiley.search(html)
-  if m and len(m.groups()) > 0:
-    res = m.groups()[0].replace('\\', '')
-    print("DOI from Google and Wiley: %s\n" % res)
-    return res
+  for url_regex, m, _ in found_urls:
 
-  m = Re.doiUrl.search(html, re.M)
-  if m and len(m.groups()) > 0:
-    res = m.groups()[0]
-    if Paras.debugBibCrawler:
-      print("DOI from Google and DOI.org: %s\n" % res)
-    return res
+    if url_regex == 'doiAcmUrl' and m and len(m.groups()) > 0:
+      res = m.groups()[0].replace('\\', '')
+      print("DOI from Google and ACM DOI: %s\n" % res)
+      return res
 
-  m = Re.ieee.search(html)
-  if m and len(m.groups()) > 0:
-    html_ieee = request_url('https://ieeexplore.ieee.org/document/%s' %
-                            m.groups()[0])
-    m = Re.doiJavascript.search(html_ieee, re.M)
-    if m and len(m.groups()) > 0:
+    if url_regex == 'acm' and m and len(m.groups()) > 0:
+      # print(m.groups()[0])
+      content_acm = request_url('https://dl.acm.org/citation.cfm?id=%s' %
+                                m.groups()[0])
+      m = Re.doiUrl.search(content_acm, re.M)
+      if m and len(m.groups()) > 0:
+        print(m.groups()[0])
+        res = m.groups()[0]
+        if Paras.debugBibCrawler:
+          print("DOI from Google and ACM CFM: %s\n" % res)
+        return res
+      # content_acm = request_url(
+      #     'https://dl.acm.org/exportformats.cfm?id=%s&expformat=bibtex' %
+      #     m.groups()[0])
+      # m = Re.acmBib.search(content_acm, re.M)
+      # # TODO: month
+      # if m and len(m.groups()) > 0:
+      #   acm_lines = m.groups()[0].splitlines()
+      #   res = ''
+      #   for l in acm_lines:
+      #     if len(l) < 3 or l[0] == '@' or l[0] == '}':
+      #       continue
+      #     mm = Re.item.search(l)
+      #     old_cur = parser.cur.copy()
+      #     if mm and len(mm.groups()) > 0:
+      #       cur_left, cur_right = mm.groups()[0].strip(), mm.groups()[1].strip()
+      #       if cur_left == 'doi':
+      #         res = cur_right
+      #       if cur_left in ['class', 'href', 'doi', 'numpages']:
+      #         continue
+      #       parser.cur[cur_left] = cur_right
+
+      #   dist = levenshtein(old_cur['title'], parser.cur['title'])
+      #   print(dist, old_cur['title'], parser.cur['title'])
+      #   if dist > 2:
+      #     parser.cur = old_cur
+      #     res = ''
+
+      #   if res:
+      #     if Paras.debugBibCrawler:
+      #       print("DOI from Google and ACM BibTeX: %s\n" % res)
+      #     return res
+
+    if url_regex == 'doiSpringer' and m and len(m.groups()) > 0:
       res = m.groups()[0].replace('\\', '')
-      print("DOI from Google and IEEE: %s\n" % res)
+      print("DOI from Google and Springer: %s\n" % res)
       return res
 
-  m = Re.doiCaltech.search(html)
-  if m and len(m.groups()) > 0:
-    html_cal = request_url('https://authors.library.caltech.edu/%s' %
-                           m.groups()[0])
-    m = Re.doiUrl.search(html_cal, re.M)
-    if m and len(m.groups()) > 0:
-      res = m.groups()[0]
-      res = res.replace('\\', '')
-      print("DOI from Google and Caltech: %s\n" % res)
+    if url_regex == 'doiWiley' and m and len(m.groups()) > 0:
+      res = m.groups()[0].replace('\\', '')
+      print("DOI from Google and Wiley: %s\n" % res)
       return res
 
-  m = Re.doiPubmed.search(html)
-  if m and len(m.groups()) > 0:
-    html_pubmed = request_url('https://www.ncbi.nlm.nih.gov/pubmed/%s' %
-                              m.groups()[0])
-    m = Re.doiUrl.search(html_pubmed, re.M)
-    if m and len(m.groups()) > 0:
+    if url_regex == 'doiUrl' and m and len(m.groups()) > 0:
       res = m.groups()[0]
-      res = res.replace('\\', '')
-      print("DOI from Google and PubMed: %s\n" % res)
+      if Paras.debugBibCrawler:
+        print("DOI from Google and DOI.org: %s\n" % res)
       return res
 
+    if url_regex == 'ieee' and m and len(m.groups()) > 0:
+      html_ieee = request_url('https://ieeexplore.ieee.org/document/%s' %
+                              m.groups()[0])
+      m = Re.doiJavascript.search(html_ieee, re.M)
+      if m and len(m.groups()) > 0:
+        res = m.groups()[0].replace('\\', '')
+        print("DOI from Google and IEEE: %s\n" % res)
+        return res
+
+    if url_regex == 'doiCaltech' and m and len(m.groups()) > 0:
+      html_cal = request_url('https://authors.library.caltech.edu/%s' %
+                            m.groups()[0])
+      m = Re.doiUrl.search(html_cal, re.M)
+      if m and len(m.groups()) > 0:
+        res = m.groups()[0]
+        res = res.replace('\\', '')
+        print("DOI from Google and Caltech: %s\n" % res)
+        return res
+
+    if url_regex == 'doiPubmed' and m  and len(m.groups()) > 0:
+      html_pubmed = request_url('https://www.ncbi.nlm.nih.gov/pubmed/%s' %
+                                m.groups()[0])
+      m = Re.doiUrl.search(html_pubmed, re.M)
+      if m and len(m.groups()) > 0:
+        res = m.groups()[0]
+        res = res.replace('\\', '')
+        print("DOI from Google and PubMed: %s\n" % res)
+        return res
+
   # Nowadays, CVPR papers are hard to fetch DOI without ieee keyword.
   html = request_url('https://www.google.com/search?q=ieee+%s' % s)
   m = Re.ieee.search(html)