From 5b4bccd5032694869b7685389ca1775d21814d91 Mon Sep 17 00:00:00 2001
From: Anders Roxell <anders.roxell@linaro.org>
Date: Tue, 6 May 2025 13:32:56 +0200
Subject: [PATCH] get-lore-repositories: scan_for_repo: add User-Agent header
 to requests

Signed-off-by: Anders Roxell <anders.roxell@linaro.org>
---
 get-lore-repositories.py | 36 ++++++++++++++----------------------
 1 file changed, 14 insertions(+), 22 deletions(-)

diff --git a/get-lore-repositories.py b/get-lore-repositories.py
index 580799270c8f..e6ea2952f3fe 100755
--- a/get-lore-repositories.py
+++ b/get-lore-repositories.py
@@ -9,6 +9,7 @@
 import argparse
 import logging
 import re
+import requests
 import subprocess as sp
 from pathlib import Path
 from urllib.parse import urlparse
@@ -87,30 +88,21 @@ def git_shallow_clone(tree, tree_base_path, date):
 def scan_for_repo(url, all_repos=False):
     full_url = f"{url}/_/text/mirror/"
     logger.debug(f"{full_url=}")
-    cmd = ["curl", "-s", full_url]
-    try:
-        proc = sp.Popen(cmd, stdout=sp.PIPE, stderr=sp.PIPE)
-        stdout, stderr = proc.communicate()
-        if proc.returncode != 0:
-            logger.error(f"curl failed: {stderr.decode()}")
-            return ""
-
-        urls = []
-        pattern = re.compile(r'href=\"(http://lore\.kernel\.org/\w+/\d+)\"')
-        for line in stdout.decode().splitlines():
-            match = pattern.search(line)
+    headers = {'User-Agent': 'LKFT-get-lore-repositories'}
+    resp = requests.get(full_url, headers=headers, timeout=30)
+    repos = []
+    pattern = re.compile(r'href="(https?://[^"]*/\d+)"')
+    if 200 <= resp.status_code < 300:
+        # Only a 2xx reply is parsed; any other status falls through to "".
+        for line in resp.text.splitlines():
+            match = pattern.search(line)
             if match:
-                href = match.group(1)
-                if not href.endswith(".git"):
-                    href += ".git"
+                repo_url = match.group(1)
                 if all_repos:
-                    urls.append(href)
-                elif "# newest" in line:
-                    return href
-        return " ".join(urls)
-    except Exception as e:
-        logger.error(f"Failed to run curl: {e}")
-        return ""
+                    repos.append(repo_url)
+                else:
+                    return repo_url
+    return " ".join(repos)
 
 if __name__ == "__main__":
     args = arg_parser().parse_args()
-- 
2.47.2

