diff --git a/CHANGELOG.md b/CHANGELOG.md index 782e249..5f5c269 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,12 @@ All notable changes to the SOTorrent dataset project will be documented in this --- -## [2018-08-28] - Release for MSR Mining Challenge 2019, based on SO data dump 2018-06-05 +## [2018-09-23] - Second release for MSR Mining Challenge 2019, based on SO data dump 2018-09-05 + +* Update to Stack Overflow data dump 2018-09-05 +* Update `PostReferenceGH` (retrieved on 2018-09-23) + +## [2018-08-28] - First release for MSR Mining Challenge 2019, based on SO data dump 2018-06-05 * Improve URL extraction (e.g., exclude matches in Markdown inline code, exclude invalid links) diff --git a/bigquery/1_extract_so_references.sql b/bigquery/1_extract_so_references.sql index 238c4f5..910847e 100644 --- a/bigquery/1_extract_so_references.sql +++ b/bigquery/1_extract_so_references.sql @@ -1,4 +1,4 @@ ---- Status: 2018-08-28 +--- Status: 2018-09-23 --- Execute this in BigQuery --- select all source code lines of text files that contain a link to Stack Overflow @@ -31,7 +31,7 @@ FROM ( ) WHERE REGEXP_CONTAINS(line, r'(?i:https?://stackoverflow\.com/[^\s)\.\"]*)'); -=> gh_so_references_2018_08_28.matched_lines +=> gh_so_references_2018_09_23.matched_lines --- join with table "files" to get information about repos @@ -44,11 +44,11 @@ SELECT size, url, line -FROM `sotorrent-org.gh_so_references_2018_08_28.matched_lines` as lines +FROM `sotorrent-org.gh_so_references_2018_09_23.matched_lines` as lines LEFT JOIN `bigquery-public-data.github_repos.files` as files ON lines.file_id = files.id; -=> gh_so_references_2018_08_28.matched_files +=> gh_so_references_2018_09_23.matched_files --- normalize the SO links to (http://stackoverflow.com/(a/q)/) @@ -72,9 +72,9 @@ SELECT ELSE url END as url, line -FROM `sotorrent-org.gh_so_references_2018_08_28.matched_files`; +FROM `sotorrent-org.gh_so_references_2018_09_23.matched_files`; -=> 
gh_so_references_2018_08_28.matched_files_normalized +=> gh_so_references_2018_09_23.matched_files_normalized --- extract post id from links, set post type id, and extract file extension from path @@ -96,11 +96,11 @@ SELECT END as post_type_id, url, line -FROM `sotorrent-org.gh_so_references_2018_08_28.matched_files_normalized` +FROM `sotorrent-org.gh_so_references_2018_09_23.matched_files_normalized` WHERE REGEXP_CONTAINS(url, r'(http:\/\/stackoverflow\.com\/(?:a|q)\/[\d]+)'); -=> gh_so_references_2018_08_28.matched_files_aq +=> gh_so_references_2018_09_23.matched_files_aq --- use camel case for column names, add number of copies, and remove line content for export to MySQL database @@ -108,7 +108,7 @@ WHERE WITH copies AS ( SELECT file_id, count(*) as copies - FROM `sotorrent-org.gh_so_references_2018_08_28.matched_files_aq` + FROM `sotorrent-org.gh_so_references_2018_09_23.matched_files_aq` GROUP BY file_id ) SELECT @@ -123,16 +123,16 @@ SELECT post_type_id as PostTypeId, url as SOUrl, CONCAT('https://raw.githubusercontent.com/', repo_name, "/", branch, "/", path) as GHUrl -FROM `sotorrent-org.gh_so_references_2018_08_28.matched_files_aq` files +FROM `sotorrent-org.gh_so_references_2018_09_23.matched_files_aq` files JOIN copies ON files.file_id = copies.file_id; -=> gh_so_references_2018_08_28.PostReferenceGH +=> gh_so_references_2018_09_23.PostReferenceGH ################################################################### -# the following tables are not present in gh_so_references_2018_08_28 +# the following tables are not present in gh_so_references_2018_09_23 # will only be created on demand ################################################################### @@ -155,7 +155,7 @@ WITH parent_id as ParentId, SOUrl, GHUrl - FROM `sotorrent-org.gh_so_references_2018_08_28.PostReferenceGH` ref + FROM `sotorrent-org.gh_so_references_2018_09_23.PostReferenceGH` ref LEFT JOIN `bigquery-public-data.stackoverflow.posts_answers` a ON ref.PostId = a.id WHERE PostTypeId=2 
@@ -180,7 +180,7 @@ FROM answers LEFT JOIN `bigquery-public-data.stackoverflow.posts_questions` q ON answers.ParentId = q.id; -=> gh_so_references_2018_08_28.PostReferenceGH_Answers +=> gh_so_references_2018_09_23.PostReferenceGH_Answers #standardSQL @@ -192,9 +192,9 @@ SELECT CommentCount, Score, ParentViewCount -FROM `sotorrent-org.gh_so_references_2018_08_28.PostReferenceGH_Answers`; +FROM `sotorrent-org.gh_so_references_2018_09_23.PostReferenceGH_Answers`; -=> gh_so_references_2018_08_28.PostReferenceGH_Answers_R +=> gh_so_references_2018_09_23.PostReferenceGH_Answers_R --- retrieve info about referenced SO questions @@ -214,12 +214,12 @@ SELECT view_count as ViewCount, SOUrl, GHUrl -FROM `sotorrent-org.gh_so_references_2018_08_28.PostReferenceGH` ref +FROM `sotorrent-org.gh_so_references_2018_09_23.PostReferenceGH` ref LEFT JOIN `bigquery-public-data.stackoverflow.posts_questions` q ON ref.PostId = q.id WHERE PostTypeId=1; -=> gh_so_references_2018_08_28.PostReferenceGH_Questions +=> gh_so_references_2018_09_23.PostReferenceGH_Questions #standardSQL @@ -231,6 +231,6 @@ SELECT CommentCount, Score, ViewCount -FROM `sotorrent-org.gh_so_references_2018_08_28.PostReferenceGH_Questions`; +FROM `sotorrent-org.gh_so_references_2018_09_23.PostReferenceGH_Questions`; -=> gh_so_references_2018_08_28.PostReferenceGH_Questions_R +=> gh_so_references_2018_09_23.PostReferenceGH_Questions_R diff --git a/sotorrent/README.md b/sotorrent/README.md index e3db89d..5bbdbb5 100644 --- a/sotorrent/README.md +++ b/sotorrent/README.md @@ -12,6 +12,6 @@ ## Data -The Stack Overflow data has been extracted from the official [Stack Exchange data dump](https://archive.org/details/stackexchange) released 2018-06-05. +The Stack Overflow data has been extracted from the official [Stack Exchange data dump](https://archive.org/details/stackexchange) released 2018-09-05. 
-The GitHub references have been retrieved from the [Google BigQuery GitHub data set](https://cloud.google.com/bigquery/public-data/github) on 2018-08-28.
+The GitHub references have been retrieved from the [Google BigQuery GitHub data set](https://cloud.google.com/bigquery/public-data/github) on 2018-09-23.