From 0266e07ff454b2b37150094165168eaaa16d2aa9 Mon Sep 17 00:00:00 2001
From: LE DURAND Matteo <matteo.le-durand@developpement-durable.gouv.fr>
Date: Thu, 13 Mar 2025 10:13:29 +0100
Subject: [PATCH] =?UTF-8?q?changement=20de=20r=C3=A9cup=C3=A9ration,=20mai?=
 =?UTF-8?q?ntenant=20on=20gader=20que=20les=20id=20en=20nombre=20et=20ensu?=
 =?UTF-8?q?ite=20on=20join=20grace=20=C3=A0=20response=20qui=20=C3=A0=20be?=
 =?UTF-8?q?aucoup=20d'info=20comme=20le=20nom,=20le=20groupe=20et=20le=20t?=
 =?UTF-8?q?opics?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 dev/gitlab_forge.R | 59 ++++++++++------------------------------------
 1 file changed, 13 insertions(+), 46 deletions(-)

diff --git a/dev/gitlab_forge.R b/dev/gitlab_forge.R
index 3ac686a..bafacb2 100644
--- a/dev/gitlab_forge.R
+++ b/dev/gitlab_forge.R
@@ -155,7 +155,7 @@ get_gitlab_issues <- function(base_url, project_id, private_token) {
   data <- get_all_pages(api_url, private_token)
   data <- data %>%
     mutate(
-      project_name = extract_project_name(web_url),
+      project_name = as.character(project_id),
       type = "issue",
       message = as.character(title),
       author = author.username
@@ -191,7 +191,7 @@ get_gitlab_events <- function(base_url, project_id, private_token) {
       updated_at = created_at,
       author = author.username
     )%>%
-    left_join(name_gitlab, by = c("project_name" = "id")) %>%
+    # left_join(name_gitlab, by = c("project_name" = "id")) %>%
     mutate(project_name = name) %>%
     select(-name)
   return(data %>% select(project_name, type, message, updated_at, author))
@@ -212,29 +212,6 @@ get_data_from_multiple_projects <- function(base_url, project_ids, private_token
   return(bind_rows(all_data))
 }
 
-get_project_topics <- function(base_url, project_id, private_token) {
-  api_url <- paste0(base_url, "/api/v4/projects/", project_id)
-  response <- GET(api_url, add_headers("PRIVATE-TOKEN" = private_token))
-
-  if (status_code(response) == 200) {
-    project_info <- fromJSON(content(response, "text", encoding = "UTF-8"), flatten = TRUE)
-    return(data.frame(
-      id = as.character(project_info$name),
-      topics = paste(project_info$topics, collapse = ", "),  # Combine les topics en une chaîne de caractères
-      stringsAsFactors = FALSE
-    ))
-  } else {
-    warning("Impossible de récupérer les topics pour le projet ID: ", project_id, " - Code: ", status_code(response))
-    return(data.frame(
-      id = project_id,
-      topics = NA,
-      stringsAsFactors = FALSE
-    ))
-  }
-}
-extract_before_at <- function(email) {
-  sub("@.*", "", email)
-}
 ###########################################################
 process_projects <- function(project_ids) {
   result_list <- lapply(project_ids, function(project_id) {
@@ -265,17 +242,14 @@ process_projects <- function(project_ids) {
 
 final_result <- process_projects(project_ids)
 final_result$project_id <- as.character(final_result$project_id)
-final_result <- final_result %>% left_join(name_gitlab, by = c("project_id" = "id")) %>%
-  mutate(project_name = name, type = "commit") %>%
-  select(-name)
-final_result <- final_result %>% mutate(author = extract_before_at(committer_email))
-final_result <- final_result %>% rename( "updated_at" = 'committed_date'  )
+# final_result <- final_result %>% left_join(name_gitlab, by = c("project_id" = "id")) %>%
+#   mutate(project_name = name, type = "commit") %>%
+#   select(-name)
+final_result <- final_result %>% mutate(author = committer_email,type = "commit")
+final_result <- final_result %>% rename( "updated_at" = 'committed_date', "project_name" = 'project_id'  )
 ff <- final_result %>% filter(is_duplicate == FALSE) %>% select(project_name,type , message ,updated_at,author)
 ###########################################################
 
-# Récupérer les topics pour tous les projets
-projects_topics <- bind_rows(lapply(project_ids, get_project_topics, base_url = base_url, private_token = private_token))
-
 # Récupérer les données pour tous les projets
 all_data_forge <- get_data_from_multiple_projects(base_url, project_ids, private_token)
 
@@ -284,26 +258,19 @@ all_data_forge <- all_data_forge %>%
 all_data_forge <- bind_rows(ff,all_data_forge)
 all_data_forge <- all_data_forge %>%
   mutate(origine ="Gitlab_Forge")
-all_data_forge <- left_join(all_data_forge, projects_topics, by = c("project_name" = "id"))
 
 #data.frame avec les groupes et le nom pour left join-----
 response_df <- data.frame(
+  id = as.character(response$id),
   name = response$name,
-  name2 = response$path, #name2 car si le projet à changé de nom alors la jointure ne se fait plus par le nom mais par le passage d'origine ce qui corrige grandement le manque d'information
-  groupe = response$namespace$path
+  groupe = response$namespace$name,
+  topics = sapply(response$topics, function(x) paste(x, collapse = ", ")) # collapse each project's topics into one comma-separated string
 )
 # Joindre les deux jeux de données
 all_data_forge <- all_data_forge %>%
-  left_join(response_df, by = c("project_name" = "name")) # Faire un premier left_join() sur la colonne name
-all_data_forge <- all_data_forge %>% # Compléter les lignes non appariées avec un second left_join() sur la colonne name2
-  left_join(
-    response_df %>%
-      select(name2, groupe),  # On ne conserve que les colonnes utiles
-    by = c("project_name" = "name2"),
-    na_matches = "never" # Empêche l'association des NA
-  ) %>%
-  mutate(groupe = coalesce(groupe.x, groupe.y)) %>%  # Priorité au 1er join
-  select(-groupe.x, -groupe.y)  # Nettoyage des colonnes temporaires
+  left_join(response_df, by = c("project_name" = "id")) # Join on the project id: project_name holds the id (as character) at this point
+
 
 # sauvegarde du jeu de donnée-----
+all_data_forge <- all_data_forge %>% select(name , type, message , updated_at , author , topics, origine,groupe)
 save(all_data_forge, file = "gitlab_forge.RData")
-- 
GitLab