Merge pull request #53 from BioPAX/paxtools-6

Paxtools 6
BioPAX · Sep 3, 2023 · ea121c2 · ea121c2
2 parents 006b7f0 + 813f572
commit ea121c2
Show file tree

Hide file tree

Showing 243 changed files with 64,066 additions and 137,935 deletions.
diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml
@@ -22,8 +22,8 @@ jobs:
       - uses: actions/checkout@v3
       - uses: actions/setup-java@v3
         with:
-          java-version: 17
+          java-version: 20
           distribution: 'temurin'
           cache: maven
-      - name: Build with Maven (JDK-17)
+      - name: Build with Maven (JDK-20)
         run: mvn --batch-mode --update-snapshots package
diff --git a/README.md b/README.md
@@ -35,20 +35,30 @@ Paxtools provides, beyond the core and converters API, a console application tha
 
 `java -jar paxtools.jar` (add -Xmx option when processing large data files).
 
-If you have [homebrew](http://brew.sh/) installed on your system (Mac OS X), you can install the latest release of Paxtools via the following brew command (there might be old version):
+### JVM options
+
+Paxtools can be built and run with JDK-17 or newer, e.g. Temurin-20. Add these JVM options or set
+`JDK_JAVA_OPTIONS="--add-opens java.base/java.lang=ALL-UNNAMED --add-opens java.base/java.lang.reflect=ALL-UNNAMED"`
+
+ - `-Dpaxtools.model.safeset=list` for read-only BioPAX models/apps, which should consume less RAM.
+ - `-Dpaxtools.normalizer.use-latest-registry=true` (if using Normalizer/Resolver) to use the latest registry.json from bioregistry.io
+
+If you have [homebrew](http://brew.sh/) installed on your system (Mac OS X), you can install the latest release of Paxtools via the following brew command (it might be an old version):
 
 ```bash
 $ brew install homebrew/science/paxtools
 $ paxtools help
 ```
 
 ## Availability
-* The latest stable Paxtools modules are available in Maven Central
-* [OSSRH public repository](https://oss.sonatype.org/content/groups/public/) (snapshots, since 4.3.1-SNAPSHOT, and releases)
-* Older BioPAX [snapshots](http://www.biopax.org/m2repo/snapshots/) and [releases](http://www.biopax.org/m2repo/releases/) Maven2 repository
-* [Downloads](http://www.biopax.org/downloads/paxtools/)
+* Maven Central repository (only releases)
+* [OSSRH public repository](https://oss.sonatype.org/content/groups/public/) (snapshots since 4.3.1, releases)
+* [old BioPAX snapshots](http://www.biopax.org/m2repo/snapshots/) and [old BioPAX releases](http://www.biopax.org/m2repo/releases/) repositories
+* [BioPAX Downloads](http://www.biopax.org/downloads/paxtools/)
+
+More information about Paxtools can be found in [the publication](http://dx.plos.org/10.1371/journal.pcbi.1003194), 
+[wiki archive](http://www.biopax.org/mediawiki/index.php/Paxtools), 
+and [BioPAX forum](https://groups.google.com/d/msg/biopax-discuss/zwtwDG23T1E/Vu1OK7iXBQAJ).
+
 
-More information about Paxtools can be found in [the publication](http://dx.plos.org/10.1371/journal.pcbi.1003194),  [wiki archive](http://www.biopax.org/mediawiki/index.php/Paxtools), and [BioPAX forum](https://groups.google.com/d/msg/biopax-discuss/zwtwDG23T1E/Vu1OK7iXBQAJ).
 
-UPDATE: can be now build and run with a JDK-18 (you might also need to add these command-line JVM options 
-or set _JAVA_OPTIONS env: --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED ) 
diff --git a/gsea-converter/pom.xml b/gsea-converter/pom.xml
@@ -2,7 +2,7 @@
   <parent>
     <groupId>org.biopax.paxtools</groupId>
     <artifactId>paxtools</artifactId>
-    <version>5.3.1-SNAPSHOT</version>
+    <version>6.0.0-SNAPSHOT</version>
   </parent>
   <modelVersion>4.0.0</modelVersion>
 

diff --git a/gsea-converter/src/main/java/org/biopax/paxtools/io/gsea/GMTConverter.java b/gsea-converter/src/main/java/org/biopax/paxtools/io/gsea/GMTConverter.java
@@ -17,29 +17,23 @@
 import java.util.*;
 
 /**
- * An advanced BioPAX to GMT format converter, which can output IDs of
- * both (or either) genetic elements and chemicals
- * (the output file may be run with the GSEA software if gene/protein IDs are there used).
+ * An advanced BioPAX to GMT format converter, which can output IDs of genetic elements or chemicals
+ * (the output file can be loaded with the GSEA software if gene/protein IDs are used).
  * 
- *     Each output entry (row) consists of three columns (tab separated):
+ * Each output entry (row) consists of three columns (tab separated):
  * name (URI), description, and the list of identifiers (of the same type).
  * For all ERs not associated with any pathway, "other" is used for name and uri.
  *
- *     The "idtype" is what specified by Constructor parameter 'idType'.
- *
- *     The list may have one or more IDs of the same type per PR,
+ * The list may have one or more IDs of the same type per Protein Reference (PR),
  * e.g., UniProt IDs or HGNC Symbols; PRs not having an xref of 
  * given db/id type are ignored. If there are fewer than three protein 
- * references per entry, it will not be printed.
+ * references per entry in total, it will not be printed.
  *
  * Note, this code assumes that the model has successfully been validated
  * and perhaps normalized (using the BioPAX Validator, Paxtools Normalizer).
  * A BioPAX L1 or L2 model is first converted to the L3.
- *
- * TODO: work in progress; add ER sub-class parameter/filter; consider using PE's xrefs as well... make public.
  */
-final class GMTConverter
-{
+final class GMTConverter {
 	private final static Logger LOG = LoggerFactory.getLogger(GMTConverter.class);
 
 	private final IdFetcher idFetcher;
@@ -112,7 +106,7 @@ public void writeToGSEA(final Model model, OutputStream out) throws IOException
 				if ((minNumIdsPerEntry <= 1 && !entry.identifiers().isEmpty())
 						|| entry.identifiers().size() >= minNumIdsPerEntry)
 				{
-					writer.write(entry.toString() + "\n");
+					writer.write(entry + "\n");
 				}
 			}
 			writer.flush();
@@ -126,12 +120,9 @@ public void writeToGSEA(final Model model, OutputStream out) throws IOException
 	 */
 	public Collection<GMTEntry> convert(final Model model)
 	{
-		final Collection<GMTEntry> toReturn = new TreeSet<GMTEntry>(new Comparator<GMTEntry>() {
-			@Override
-			public int compare(GMTEntry o1, GMTEntry o2) {
-				return o1.toString().compareTo(o2.toString());
-			}
-		});
+		final Collection<GMTEntry> toReturn = new TreeSet<>(
+			Comparator.comparing(GMTEntry::toString)
+		);
 
 		Model l3Model;
 		// convert to level 3 if necessary
@@ -143,9 +134,9 @@ public int compare(GMTEntry o1, GMTEntry o2) {
 		//a modifiable copy of the set of all PRs in the model -
 		//after all, it has all the ERs that do not belong to any pathway
 		final Set<EntityReference> entityReferences =
-				new HashSet<EntityReference>(l3Model.getObjects(EntityReference.class));
+				new HashSet<>(l3Model.getObjects(EntityReference.class));
 
-		final Set<Pathway> pathways = l3Model.getObjects(Pathway.class);
+		final Collection<Pathway> pathways = l3Model.getObjects(Pathway.class);
 		for (Pathway pathway : pathways)
 		{
 			String name = (pathway.getDisplayName() == null) ? pathway.getStandardName() : pathway.getDisplayName();
@@ -156,7 +147,7 @@ public int compare(GMTEntry o1, GMTEntry o2) {
 			final String currentPathwayName = name;
 
 			LOG.debug("Begin converting " + currentPathwayName + " pathway, uri=" + currentPathway.getUri());
-			final Set<EntityReference> ers = new HashSet<EntityReference>();
+			final Set<EntityReference> ers = new HashSet<>();
 			final Traverser traverser = new AbstractTraverser(SimpleEditorMap.L3,
 					Fetcher.nextStepFilter, Fetcher.objectPropertiesOnlyFilter) {
 				@Override
@@ -190,39 +181,43 @@ protected void visit(Object range, BioPAXElement domain, Model model, PropertyEd
 				if(!entries.isEmpty())
 					toReturn.addAll(entries);
 				entityReferences.removeAll(ers);//keep not processed PRs (a PR can be processed multiple times)
-				LOG.debug("- collected " + entries.size() + "entries.");
+				LOG.debug("- collected " + entries.size() + " entries.");
 			}
 		}
 
-		//when there're no pathways, only empty pathays, pathways w/o PRs, then use all/rest of PRs -
+		//when there are no pathways, only empty, or pathways without any PRs, then use the rest of PRs -
 		//organize PRs by species (GSEA s/w can handle only same species identifiers in a data row)
 		if(!entityReferences.isEmpty() && !skipOutsidePathways) {
 			LOG.info("Creating entries for the rest of PRs (outside any pathway)...");
-			toReturn.addAll(createGseaEntries("other","other", getDataSource(l3Model.getObjects(Provenance.class)),entityReferences));
+			toReturn.addAll(createGseaEntries("other","other",
+				getDataSource(l3Model.getObjects(Provenance.class)), entityReferences));
 		}
 
 		return toReturn;
 	}
 
-	private Collection<GMTEntry> createGseaEntries(String uri, final String name, final String dataSource,
-												   final Set<EntityReference> ers)
+	private Collection<GMTEntry> createGseaEntries(String uri, String name,
+																								 String dataSource,
+																								 Collection<EntityReference> ers)
 	{
-		final Collection<GMTEntry> toReturn = new ArrayList<GMTEntry>();
-		GMTEntry entry = new GMTEntry(uri, "", "", String.format("name: %s; datasource: %s",name, dataSource));
-		for (EntityReference er : ers)
+		Collection<GMTEntry> toReturn = new ArrayList<>();
+		GMTEntry entry = new GMTEntry(uri, "", "",
+			String.format("name: %s; datasource: %s", name, dataSource));
+		for (EntityReference er : ers) {
 			entry.identifiers().addAll(idFetcher.fetchID(er));
+		}
 		toReturn.add(entry);
 		return toReturn;
 	}
 
 	/*
 	 * Gets datasource names, if any, in a consistent way/order, excl. duplicates
 	 */
-	private String getDataSource(Set<Provenance> provenances)
+	private String getDataSource(Collection<Provenance> provenances)
 	{
 		if(provenances.isEmpty()) return "N/A";
 
-		Set<String> dsNames = new TreeSet<String>();
+		Set<String> dsNames = new TreeSet<>();
 		for (Provenance provenance : provenances)
 		{
 			String name = provenance.getDisplayName();