Scala Actors instead of Java Futures
Problem: I need to write an application to process several hundred files, each of which will take several hundred megabytes and several seconds to process. I have written it using Future[Report]
objects created using an Executors.newFixedThreadPool()
, but got out-of-memory errors because the List[Future[Report]]
object returned by ExecutorService.invokeAll()
was holding on to the intermediate memory used by each process. I solved the problem by returning the Report
objects from local methods in the processors after calculating the Report
values (only a few hundred lines per Report
) instead of doing the calculations in the call
method (from interface Callable
).
I would like to try solving this using Scala Actors instead. I created a class that takes a sequence of jobs (parameterized types for the jobs, results, and processing function) and processes each in one of a configurable number of Worker
instances (subclass of Actor
). The code follows.
Issues:
I'm not sure that my processing is correct.
I don't like using the
CountDownLatch
to delay returning a result from the dispatcher. I would prefer to write a more "functional" version of the dispatcher that does not modify the
jobQueue
list or workers
hashmap, perhaps borrowing the tail-recursive loop
structure from Clojure (I've used a @tailrec def loop
method in other Scala code).
I am anxiously awaiting the publication of "Actors in Scala" by Philipp Haller and Frank Sommers.
Here is the code:
package multi_worker
import scala.actors.Actor
import java.util.concurrent.CountDownLatch
/** Companion holding the constants the workers use when reporting memory usage. */
object MultiWorker {
  // Number of bytes in a megabyte (2^20); divisor for the JVM memory figures.
  private val megabyte = 1 << 20
  // The JVM runtime, queried for total and free memory.
  private val runtime = Runtime.getRuntime
}
/**
 * Processes a list of jobs on a fixed number of worker actors and collects
 * the results.
 *
 * Constructing an instance starts the logger and the dispatcher immediately.
 * The dispatcher creates and starts `actorCount` workers, hands each worker one
 * job at a time, tells a worker to stop once the job queue is empty, and
 * releases `latch` after every worker has confirmed that it stopped.
 *
 * @tparam A job type
 * @tparam B result type
 * @param jobs       the jobs to process
 * @param actorCount number of worker actors to start
 * @param process    function applied to each job inside a worker
 */
class MultiWorker[A, B](jobs: List[A],
                        actorCount: Int)(process: (A) => B) {
  import MultiWorker._

  sealed abstract class Message
  // Dispatcher -> Worker: Run this job and report results
  case class Process(job: A) extends Message
  // Worker -> Dispatcher: Result of processing
  case class ReportResult(id: Int, result: B) extends Message
  // Worker -> Dispatcher: I need work -- send me a job
  case class SendJob(id: Int) extends Message
  // Worker -> Dispatcher: I have stopped as requested
  case class Stopped(id: Int) extends Message
  // Dispatcher -> Worker/Logger: Stop working -- all jobs done.
  // A case object (the message carries no data) so that the
  // `case StopWorking =>` patterns below match the value that is sent.
  case object StopWorking extends Message

  /**
   * A simple logger that can be sent text messages that will be written to the
   * console. Used so that messages from the actors do not step on each other.
   * Terminates when it receives StopWorking.
   */
  object Logger extends Actor {
    def act(): Unit = {
      loop {
        react {
          case text: String => println(text)
          case StopWorking => exit()
        }
      }
    }
  }
  Logger.start()

  /**
   * A worker actor that will process jobs and return results to the
   * dispatcher.
   *
   * @param id key under which the dispatcher tracks this worker
   */
  class Worker(id: Int) extends Actor {
    def act(): Unit = {
      // Ask the dispatcher for an initial job
      dispatcher ! SendJob(id)
      loop {
        react {
          case Process(job) =>
            val startTime = System.nanoTime
            dispatcher ! ReportResult(id, process(job))
            val endTime = System.nanoTime
            val totalMemory = (runtime.totalMemory / megabyte)
            val usedMemory = totalMemory - (runtime.freeMemory / megabyte)
            val message = "Finished job " + job + " in " +
              ((endTime - startTime) / 1000000000.0) +
              " seconds using " + usedMemory +
              "MB out of total " + totalMemory + "MB"
            Logger ! message
            // Ask for the next job; the dispatcher answers with Process or StopWorking
            dispatcher ! SendJob(id)
          case StopWorking =>
            dispatcher ! Stopped(id)
            exit()
        }
      }
    }
  }

  // Released by the dispatcher once every worker has stopped.
  val latch = new CountDownLatch(1)
  // Results, prepended as they arrive. Written only by the dispatcher; callers
  // read it through `results`, after `latch` has been released.
  var res = List.empty[B]

  /**
   * The job dispatcher that sends jobs to the workers until the job queue
   * (jobs: List[A]) is empty. It then tells each worker that asks for more
   * work to stop, and releases the latch when the last worker has stopped.
   */
  val dispatcher = new Actor {
    def act(): Unit = {
      var jobQueue = jobs
      var workers = (0 until actorCount).map(id => (id, new Worker(id))).toMap

      // Stops the logger, unblocks `results` and terminates the dispatcher.
      def finish(): Unit = {
        Logger ! StopWorking
        latch.countDown()
        exit()
      }

      // With no workers nobody will ever report back, so finish immediately.
      if (workers.isEmpty) finish()
      workers.values.foreach(_.start())
      loop {
        react {
          case ReportResult(_, result) =>
            res = result :: res
          case SendJob(id) =>
            jobQueue match {
              case job :: remaining =>
                workers(id) ! Process(job)
                jobQueue = remaining
              case Nil =>
                // Queue drained: this worker has nothing left to do.
                workers(id) ! StopWorking
            }
          case Stopped(id) =>
            workers = workers - id
            // A worker sends its ReportResult before its Stopped, so once the
            // last worker has stopped every result has been recorded.
            if (workers.isEmpty) finish()
        }
      }
    }
  }
  dispatcher.start()

  /**
   * Get the results of the processing -- wait for the dispatcher to finish
   * before returning.
   *
   * @return the collected results; each result was prepended on arrival
   */
  def results: List[B] = {
    latch.await()
    res
  }
}
After a quick glance, I would propose the following update:
// Carries the final result list from the dispatcher to the caller; used instead
// of the countdown latch. Requires `import scala.actors.Channel`.
val resultsChannel = new Channel[List[B]]

/**
 * The job dispatcher. Queue, workers and results are immutable values passed
 * from one `loop` step to the next instead of being mutated in place.
 */
val dispatcher = new Actor {
  def act(): Unit = {
    // `until`, not `to`: exactly actorCount workers, with ids 0 .. actorCount - 1.
    val workers = (0 until actorCount).map { id =>
      val worker = new Worker(id)
      worker.start()
      (id, worker)
    }.toMap
    // Start from the real job list; starting from Nil would dispatch nothing.
    loop(jobs, workers, Nil)
  }

  // Not annotated @tailrec: the recursive calls sit inside the react handler,
  // not in tail position of `loop` itself.
  def loop(jobQueue: List[A],
           workers: Map[Int, Worker],
           res: List[B]): Unit = {
    if (workers.isEmpty) {
      // Every worker has stopped, so every result has been reported:
      // publish the results and let the dispatcher finish.
      resultsChannel ! res
    } else {
      react {
        case ReportResult(_, result) =>
          loop(jobQueue, workers, result :: res)
        case SendJob(id) =>
          jobQueue match {
            case job :: remaining =>
              workers(id) ! Process(job)
              loop(remaining, workers, res)
            case Nil =>
              // Queue drained: stop this worker and keep handling messages
              // until all workers have confirmed with Stopped.
              workers(id) ! StopWorking
              loop(Nil, workers, res)
          }
        case Stopped(id) =>
          loop(jobQueue, workers - id, res)
      }
    }
  }
}
dispatcher.start()
// The channel delivers the result list exactly once, so keep it after the first
// read. A plain field is used rather than a lazy val so that no lock is held
// while waiting on the channel.
private var cachedResults: Option[List[B]] = None

/**
 * Get the results of the processing. The first call waits synchronously for the
 * dispatcher to publish the list on resultsChannel; later calls return the same
 * list without waiting.
 *
 * NOTE(review): presumably this must be called from the thread that created
 * resultsChannel -- confirm against the scala.actors.Channel documentation.
 */
def results: List[B] = cachedResults.getOrElse {
  val received = resultsChannel.receive {
    case finished => finished
  }
  cachedResults = Some(received)
  received
}
Here is the final version that I came up with (thanks to Vasil Remeniuk). The println
statements tagged with a // DEBUG
comment are to show the progression and the main
method is a unit test:
import scala.actors.Actor
import scala.actors.Channel
import scala.actors.Scheduler
import scala.annotation.tailrec
/**
 * Companion holding the constants used for the workers' memory reporting, plus
 * a small `main` that exercises MultiWorker with five integer jobs.
 */
object MultiWorker {
  // Number of bytes in a megabyte (2^20); divisor for the JVM memory figures.
  private val megabyte = 1 << 20
  // The JVM runtime, queried for total and free memory.
  private val runtime = Runtime.getRuntime

  /** Runs jobs 0 to 4 on two workers, prints the results, then shuts the actor scheduler down. */
  def main(args: Array[String]): Unit = {
    val jobs = List.range(0, 5)
    // Each job sleeps briefly, prints its value and returns it unchanged.
    val slowIdentity = (value: Int) => {
      Thread.sleep(100)
      println(value)
      value
    }
    val multiWorker = new MultiWorker[Int, Int](jobs, 2, slowIdentity)
    println("multiWorker.results: " + multiWorker.results)
    Scheduler.shutdown
  }
}
/**
 * Processes a list of jobs on a fixed number of worker actors and collects
 * the results.
 *
 * Constructing an instance starts the logger and the dispatcher immediately.
 * The dispatcher creates and starts `actorCount` workers, hands each worker one
 * job at a time, tells a worker to stop once the job queue is empty, and
 * publishes the accumulated results on `resultsChannel` after every worker has
 * confirmed that it stopped.
 *
 * @tparam A job type
 * @tparam B result type
 * @param jobs       the jobs to process
 * @param actorCount number of worker actors to start
 * @param process    function applied to each job inside a worker
 */
class MultiWorker[A, B](jobs: List[A],
                        actorCount: Int,
                        process: (A) => B) {
  import MultiWorker._

  sealed abstract class Message
  // Dispatcher -> Worker: Run this job and report results
  case class Process(job: A) extends Message
  // Worker -> Dispatcher: Result of processing
  case class ReportResult(id: Int, result: B) extends Message
  // Worker -> Dispatcher: I need work -- send me a job
  case class SendJob(id: Int) extends Message
  // Worker -> Dispatcher: I have stopped as requested
  case class Stopped(id: Int) extends Message
  // Dispatcher -> Worker/Logger: Stop working -- all jobs done
  case class StopWorking() extends Message

  /**
   * A simple logger that can be sent text messages that will be written to the
   * console. Used so that messages from the actors do not step on each other.
   * Terminates when it receives StopWorking().
   */
  object Logger extends Actor {
    def act(): Unit = {
      loop {
        react {
          case text: String => println(text)
          // Match an instance: a bare `StopWorking` would only match the
          // companion object, which is never sent.
          case StopWorking() => exit()
        }
      }
    }
  }
  Logger.start()

  /**
   * A worker actor that will process jobs and return results to the
   * dispatcher.
   *
   * @param id key under which the dispatcher tracks this worker
   */
  case class Worker(id: Int) extends Actor {
    def act(): Unit = {
      // Ask the dispatcher for an initial job
      dispatcher ! SendJob(id)
      loop {
        react {
          case Process(job) =>
            println("Worker(" + id + "): " + Process(job)) // DEBUG
            val startTime = System.nanoTime
            dispatcher ! ReportResult(id, process(job))
            val endTime = System.nanoTime
            val totalMemory = (runtime.totalMemory / megabyte)
            val usedMemory = totalMemory - (runtime.freeMemory / megabyte)
            val message = "Finished job " + job + " in " +
              ((endTime - startTime) / 1000000000.0) +
              " seconds using " + usedMemory +
              "MB out of total " + totalMemory + "MB"
            Logger ! message
            // Ask for the next job; the dispatcher answers with Process or StopWorking()
            dispatcher ! SendJob(id)
          case StopWorking() =>
            println("Worker(" + id + "): " + StopWorking()) // DEBUG
            dispatcher ! Stopped(id)
            exit()
        }
      }
    }
  }

  // Carries the final result list from the dispatcher to the caller of `results`.
  val resultsChannel = new Channel[List[B]]

  /**
   * The job dispatcher that sends jobs to the workers until the job queue
   * (jobs: List[A]) is empty. It then tells the workers to stop working and
   * sends the List[B] results to resultsChannel.
   */
  val dispatcher: Actor = new Actor {
    def act(): Unit = {
      // Not annotated @tailrec: the recursive calls sit inside the react
      // handler, not in tail position of `loop` itself.
      def loop(jobs: List[A],
               workers: Map[Int, Worker],
               acc: List[B]): Unit = {
        println("dispatcher: loop: jobs: " + jobs + ", workers: " + workers + ", acc: " + acc) // DEBUG
        if (!workers.isEmpty) { // Stop recursion when there are no more workers
          react {
            case ReportResult(id, result) =>
              println("dispatcher: " + ReportResult(id, result)) // DEBUG
              loop(jobs, workers, result :: acc)
            case SendJob(id) =>
              println("dispatcher: " + SendJob(id)) // DEBUG
              if (!jobs.isEmpty) {
                println("dispatcher: " + "Sending: " + Process(jobs.head) + " to " + id) // DEBUG
                workers(id) ! Process(jobs.head)
                loop(jobs.tail, workers, acc)
              } else {
                println("dispatcher: " + "Sending: " + StopWorking() + " to " + id) // DEBUG
                workers(id) ! StopWorking()
                loop(Nil, workers, acc)
              }
            case Stopped(id) =>
              println("dispatcher: " + Stopped(id)) // DEBUG
              loop(jobs, workers - id, acc)
          }
        } else {
          println("dispatcher: " + "jobs: " + jobs + ", workers: " + workers + ", acc: " + acc) // DEBUG
          // Every worker has stopped, so no further log messages will be sent.
          Logger ! StopWorking()
          resultsChannel ! acc
        }
      }

      // Create and start the workers without casting the result of start().
      val workers = (0 until actorCount).map { id =>
        val worker = new Worker(id)
        worker.start()
        (id, worker)
      }.toMap
      loop(jobs, workers, Nil)
      exit()
    }
  }
  // Started only after the `dispatcher` field has been assigned: the workers
  // read that field from their own threads as soon as they run.
  dispatcher.start()

  // The channel delivers the result list exactly once, so keep it after the
  // first read. A plain field is used rather than a lazy val so that no lock is
  // held while waiting on the channel.
  private var cachedResults: Option[List[B]] = None

  /**
   * Get the results of the processing -- wait for the dispatcher to finish
   * before returning. Later calls return the same list without waiting.
   *
   * NOTE(review): presumably this must be called from the thread that created
   * this instance (and with it resultsChannel) -- confirm against the
   * scala.actors.Channel documentation.
   *
   * @return the collected results; each result was prepended on arrival
   */
  def results: List[B] = cachedResults.getOrElse {
    val received = resultsChannel.receive {
      case finished => finished
    }
    cachedResults = Some(received)
    received
  }
}
精彩评论