Skip to content

Commit

Permalink
[SPARK-28098][SQL]Support read partitioned Hive tables with (#40)
Browse files Browse the repository at this point in the history
  • Loading branch information
catalinii authored Aug 16, 2021
1 parent d0718cb commit b4400c7
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3040,6 +3040,14 @@ object SQLConf {
.booleanConf
.createWithDefault(false)

// Flag read by InMemoryFileIndex and HiveMetastoreCatalog to decide whether data files that
// sit in nested subdirectories of a partitioned Hive table are picked up. Enabled by default.
val READ_PARTITION_WITH_SUBDIRECTORY_ENABLED =
  buildConf("spark.sql.sources.readPartitionWithSubdirectory.enabled")
    // The two fragments are concatenated; only one of them may carry the separating space,
    // otherwise the rendered description contains a double space.
    .doc("When set to true, Spark SQL could read the files of " +
      "partitioned hive table from subdirectories under root path of table")
    .booleanConf
    .createWithDefault(true)


/**
* Holds information about keys that have been deprecated.
*
Expand Down Expand Up @@ -3694,6 +3702,9 @@ class SQLConf extends Serializable with Logging {

/** Current value of the [[SQLConf.LEGACY_CHAR_VARCHAR_AS_STRING]] entry. */
def charVarcharAsString: Boolean = getConf(SQLConf.LEGACY_CHAR_VARCHAR_AS_STRING)

/** Current value of the [[SQLConf.READ_PARTITION_WITH_SUBDIRECTORY_ENABLED]] entry. */
def readPartitionWithSubdirectoryEnabled: Boolean = {
  getConf(SQLConf.READ_PARTITION_WITH_SUBDIRECTORY_ENABLED)
}

/** ********************** SQLConf functionality methods ************ */

/** Set Spark SQL configuration properties. */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ class InMemoryFileIndex(
// Keep only the specified root paths for which
// FileStreamSink.ancestorIsMetadataDirectory reports false.
override val rootPaths =
  rootPathsSpecified.filterNot { root =>
    FileStreamSink.ancestorIsMetadataDirectory(root, hadoopConf)
  }

// Value of the session's readPartitionWithSubdirectoryEnabled setting, read once when this
// index is constructed; later changes to the session conf are not reflected here.
val readPartitionWithSubdirectoryEnabled: Boolean =
  sparkSession.sessionState.conf.readPartitionWithSubdirectoryEnabled

// Leaf files keyed by their path, rebuilt from the listing of `rootPaths`.
@volatile private var cachedLeafFiles: mutable.LinkedHashMap[Path, FileStatus] = _
// Leaf files grouped by directory: the file's parent directory, or the result of
// getRootPathsLeafDir for that parent when readPartitionWithSubdirectoryEnabled is set.
@volatile private var cachedLeafDirToChildrenFiles: Map[Path, Array[FileStatus]] = _
// Cached partition spec; reset to null whenever the file listing is rebuilt.
@volatile private var cachedPartitionSpec: PartitionSpec = _
Expand Down Expand Up @@ -94,10 +97,23 @@ class InMemoryFileIndex(
val files = listLeafFiles(rootPaths)
cachedLeafFiles =
new mutable.LinkedHashMap[Path, FileStatus]() ++= files.map(f => f.getPath -> f)
cachedLeafDirToChildrenFiles = files.toArray.groupBy(_.getPath.getParent)
cachedLeafDirToChildrenFiles =
if (readPartitionWithSubdirectoryEnabled) {
files.toArray.groupBy(file => getRootPathsLeafDir(file.getPath.getParent))
} else {
files.toArray.groupBy(_.getPath.getParent)
}
cachedPartitionSpec = null
}

/**
 * Resolves a directory to the root path that contains it, so that files found in nested
 * subdirectories are grouped under the root path they were listed from.
 *
 * Walks from `path` up through its ancestors and returns the first one present in
 * `rootPaths`. When no ancestor is a root path (for example a root path that points at a
 * file rather than a directory), `path` itself is returned; the walk stops once the parent
 * is null instead of dereferencing it.
 *
 * @param path directory to resolve, typically the parent of a leaf file
 * @return the closest ancestor-or-self of `path` contained in `rootPaths`, or `path` when
 *         there is none
 */
private def getRootPathsLeafDir(path: Path): Path = {
  @scala.annotation.tailrec
  def closestRoot(candidate: Path): Option[Path] = Option(candidate) match {
    // Ran out of ancestors (the parent of the filesystem root is null): no match.
    case None => None
    case Some(dir) if rootPaths.contains(dir) => Some(dir)
    case Some(dir) => closestRoot(dir.getParent)
  }

  closestRoot(path).getOrElse(path)
}

override def equals(other: Any): Boolean = other match {
case hdfs: InMemoryFileIndex => rootPaths.toSet == hdfs.rootPaths.toSet
case _ => false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import com.google.common.util.concurrent.Striped
import org.apache.hadoop.fs.Path

import org.apache.spark.SparkException
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.internal.Logging
import org.apache.spark.sql.{AnalysisException, SparkSession}
import org.apache.spark.sql.catalyst.{QualifiedTableName, TableIdentifier}
Expand Down Expand Up @@ -241,7 +242,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
LogicalRelation(
DataSource(
sparkSession = sparkSession,
paths = rootPath.toString :: Nil,
paths = getDirectoryPathSeq(rootPath),
userSpecifiedSchema = Option(updatedTable.dataSchema),
bucketSpec = None,
options = options,
Expand Down Expand Up @@ -277,6 +278,18 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
result.copy(output = newOutput)
}

/**
 * Builds the list of paths handed to the data source for a table rooted at `rootPath`.
 *
 * When readPartitionWithSubdirectoryEnabled is off, this is just `rootPath`. When it is on,
 * the leaf directories reported by SparkHadoopUtil.listLeafDirStatuses for `rootPath` are
 * returned instead.
 *
 * @param rootPath root location of the table
 * @return string form of the paths to read
 */
private def getDirectoryPathSeq(rootPath: Path): Seq[String] = {
  val subdirectoriesEnabled =
    sparkSession.sessionState.conf.readPartitionWithSubdirectoryEnabled

  if (!subdirectoriesEnabled) {
    rootPath.toString :: Nil
  } else {
    val hadoopConf = sparkSession.sessionState.newHadoopConf()
    val fileSystem = rootPath.getFileSystem(hadoopConf)
    val leafDirs = SparkHadoopUtil.get.listLeafDirStatuses(fileSystem, rootPath)
    leafDirs.map(status => status.getPath.toString)
  }
}

private def inferIfNeeded(
relation: HiveTableRelation,
options: Map[String, String],
Expand Down

0 comments on commit b4400c7

Please sign in to comment.