Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

reimplement fix for akka/pekko cluster #1594

Merged
merged 15 commits into from
Jan 1, 2025
47 changes: 47 additions & 0 deletions .github/workflows/build-test-prValidation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,53 @@ jobs:
-Dio.netty.leakDetection.level=PARANOID \
validatePullRequest
pekko-classic-remoting-tests:
name: Pekko Classic Remoting Tests
runs-on: ubuntu-22.04
if: github.repository == 'apache/pekko'
strategy:
fail-fast: false
matrix:
command:
- cluster/test distributed-data/test cluster-tools/test cluster-metrics/test
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

need sbt cluster/MultiJvm/test too

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#1592 covers that. It looks like running the multiJvm tests will be more complicated. The aim of this small change is that the test modified in this PR runs in the PR CI - because at the momemt, it does not.

steps:
- name: Checkout
uses: actions/checkout@v4
with:
# we don't know what commit the last tag was it's safer to get entire repo so previousStableVersion resolves
fetch-depth: 0
fetch-tags: true

- name: Setup Java 11
uses: actions/setup-java@v4
with:
distribution: temurin
java-version: 11

- name: Install sbt
uses: sbt/setup-sbt@v1

- name: Cache Coursier cache
uses: coursier/cache-action@v6

- name: Enable jvm-opts
run: cp .jvmopts-ci .jvmopts

- name: sbt ${{ matrix.command }}
env:
DEVELOCITY_ACCESS_KEY: ${{ secrets.GE_ACCESS_TOKEN }}
# note that this is not running any multi-jvm tests because multi-in-test=false
run: |-
sbt \
-Djava.security.egd=file:/dev/./urandom \
-Dpekko.remote.artery.enabled=off \
-Dpekko.test.timefactor=2 \
-Dpekko.actor.testkit.typed.timefactor=2 \
-Dpekko.test.tags.exclude=gh-exclude,timing \
-Dpekko.test.multi-in-test=false \
-Dpekko.cluster.assert=on \
clean ${{ matrix.command }}
jdk-21-extra-tests:
name: Java 21 Extra Tests (including all tests that need Java 9+)
runs-on: ubuntu-22.04
Expand Down
29 changes: 17 additions & 12 deletions cluster/src/main/scala/org/apache/pekko/cluster/ClusterDaemon.scala
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@

package org.apache.pekko.cluster

import scala.annotation.nowarn
import scala.collection.immutable
import scala.concurrent.Future
import scala.concurrent.Promise
import scala.concurrent.duration._
import scala.util.control.NonFatal

import scala.annotation.nowarn
import com.typesafe.config.Config

import org.apache.pekko
Expand All @@ -30,13 +30,11 @@ import pekko.annotation.InternalApi
import pekko.cluster.ClusterEvent._
import pekko.cluster.MemberStatus._
import pekko.dispatch.{ RequiresMessageQueue, UnboundedMessageQueueSemantics }
import pekko.event.ActorWithLogClass
import pekko.event.Logging
import pekko.event.{ ActorWithLogClass, Logging }
import pekko.pattern.ask
import pekko.remote.{ QuarantinedEvent => ClassicQuarantinedEvent }
import pekko.remote.{ QuarantinedEvent => ClassicQuarantinedEvent, RemoteSettings }
import pekko.remote.artery.QuarantinedEvent
import pekko.util.Timeout
import pekko.util.Version
import pekko.util.{ Timeout, Version }

/**
* Base trait for all cluster messages. All ClusterMessage's are serializable.
Expand Down Expand Up @@ -365,6 +363,13 @@ private[cluster] class ClusterCoreDaemon(publisher: ActorRef, joinConfigCompatCh
val statsEnabled = PublishStatsInterval.isFinite
var gossipStats = GossipStats()

val acceptedProtocols: Set[String] = {
val remoteSettings: RemoteSettings = new RemoteSettings(context.system.settings.config)
val initSet = remoteSettings.AcceptProtocolNames
val tcpSet = initSet.map(protocol => s"$protocol.tcp")
initSet ++ tcpSet
}

var seedNodes = SeedNodes
var seedNodeProcess: Option[ActorRef] = None
var seedNodeProcessCounter = 0 // for unique names
Expand Down Expand Up @@ -701,10 +706,10 @@ private[cluster] class ClusterCoreDaemon(publisher: ActorRef, joinConfigCompatCh
* which will reply with a `Welcome` message.
*/
def join(address: Address): Unit = {
if (address.protocol != selfAddress.protocol)
if (!acceptedProtocols.contains(address.protocol))
logWarning(
"Trying to join member with wrong protocol, but was ignored, expected [{}] but was [{}]",
selfAddress.protocol,
"Trying to join member with wrong protocol, but was ignored, expected any of {} but was [{}]",
acceptedProtocols,
address.protocol)
else if (address.system != selfAddress.system)
logWarning(
Expand Down Expand Up @@ -750,10 +755,10 @@ private[cluster] class ClusterCoreDaemon(publisher: ActorRef, joinConfigCompatCh
def joining(joiningNode: UniqueAddress, roles: Set[String], appVersion: Version): Unit = {
if (!preparingForShutdown) {
val selfStatus = latestGossip.member(selfUniqueAddress).status
if (joiningNode.address.protocol != selfAddress.protocol)
if (!acceptedProtocols.contains(joiningNode.address.protocol))
logWarning(
"Member with wrong protocol tried to join, but was ignored, expected [{}] but was [{}]",
selfAddress.protocol,
"Member with wrong protocol tried to join, but was ignored, expected any of {} but was [{}]",
acceptedProtocols,
joiningNode.address.protocol)
else if (joiningNode.address.system != selfAddress.system)
logWarning(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.pekko.cluster

import com.typesafe.config.{ Config, ConfigFactory }

import org.apache.pekko.testkit.{ LongRunningTest, PekkoSpec }

object MixedProtocolClusterSpec {

val baseConfig: Config =
ConfigFactory.parseString("""
pekko.actor.provider = "cluster"
pekko.coordinated-shutdown.terminate-actor-system = on
pekko.remote.artery.canonical.port = 0
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we need test both pekko.tcp and pekko, I mean both artery and classical transports, seems this only test one.

pekko.remote.classic.netty.tcp.port = 0
pekko.remote.artery.advanced.aeron.idle-cpu-level = 3
pekko.remote.accept-protocol-names = ["pekko", "akka"]
pekko.cluster.jmx.multi-mbeans-in-same-jvm = on
pekko.cluster.configuration-compatibility-check.enforce-on-join = off
""")

val configWithUdp: Config =
ConfigFactory.parseString("""
pekko.remote.artery.transport = "aeron-udp"
""").withFallback(baseConfig)

val configWithPekkoUdp: Config =
ConfigFactory.parseString("""
pekko.remote.protocol-name = "pekko"
""").withFallback(configWithUdp)

val configWithAkkaUdp: Config =
ConfigFactory.parseString("""
pekko.remote.protocol-name = "akka"
""").withFallback(configWithUdp)

val configWithPekkoTcp: Config =
ConfigFactory.parseString("""
pekko.remote.protocol-name = "pekko"
""").withFallback(baseConfig)

val configWithAkkaTcp: Config =
ConfigFactory.parseString("""
pekko.remote.protocol-name = "akka"
""").withFallback(baseConfig)

val configWithNetty: Config =
ConfigFactory.parseString("""
pekko.remote.artery.enabled = false
pekko.remote.classic {
enabled-transports = ["pekko.remote.classic.netty.tcp"]
}
""").withFallback(baseConfig)

val configWithPekkoNetty: Config =
ConfigFactory.parseString("""
pekko.remote.protocol-name = "pekko"
""").withFallback(configWithNetty)

val configWithAkkaNetty: Config =
ConfigFactory.parseString("""
pekko.remote.protocol-name = "akka"
""").withFallback(configWithNetty)
}

class MixedProtocolClusterSpec extends PekkoSpec with ClusterTestKit {

import MixedProtocolClusterSpec._

"A node using the akka protocol" must {

"be allowed to join a cluster with a node using the pekko protocol (udp)" taggedAs LongRunningTest in {

val clusterTestUtil = new ClusterTestUtil(system.name)
try {
// start the first node with the "pekko" protocol
clusterTestUtil.newActorSystem(configWithPekkoUdp)

// have a node using the "akka" protocol join
val joiningNode = clusterTestUtil.newActorSystem(configWithAkkaUdp)
clusterTestUtil.formCluster()

awaitCond(clusterTestUtil.isMemberUp(joiningNode), message = "awaiting joining node to be 'Up'")
} finally {
clusterTestUtil.shutdownAll()
}
}

"be allowed to join a cluster with a node using the pekko protocol (tcp)" taggedAs LongRunningTest in {

val clusterTestUtil = new ClusterTestUtil(system.name)
try {
// start the first node with the "pekko" protocol
clusterTestUtil.newActorSystem(configWithPekkoTcp)

// have a node using the "akka" protocol join
val joiningNode = clusterTestUtil.newActorSystem(configWithAkkaTcp)
clusterTestUtil.formCluster()

awaitCond(clusterTestUtil.isMemberUp(joiningNode), message = "awaiting joining node to be 'Up'")
} finally {
clusterTestUtil.shutdownAll()
}
}

"be allowed to join a cluster with a node using the pekko protocol (netty)" taggedAs LongRunningTest in {

val clusterTestUtil = new ClusterTestUtil(system.name)
try {
// start the first node with the "pekko" protocol
clusterTestUtil.newActorSystem(configWithPekkoNetty)

// have a node using the "akka" protocol join
val joiningNode = clusterTestUtil.newActorSystem(configWithAkkaNetty)
clusterTestUtil.formCluster()

awaitCond(clusterTestUtil.isMemberUp(joiningNode), message = "awaiting joining node to be 'Up'")
} finally {
clusterTestUtil.shutdownAll()
}
}

"allow a node using the pekko protocol to join the cluster (udp)" taggedAs LongRunningTest in {

val clusterTestUtil = new ClusterTestUtil(system.name)
try {
// create the first node with the "akka" protocol
clusterTestUtil.newActorSystem(configWithAkkaUdp)

// have a node using the "pekko" protocol join
val joiningNode = clusterTestUtil.newActorSystem(configWithPekkoUdp)
clusterTestUtil.formCluster()

awaitCond(clusterTestUtil.isMemberUp(joiningNode), message = "awaiting joining node to be 'Up'")
} finally {
clusterTestUtil.shutdownAll()
}
}

"allow a node using the pekko protocol to join the cluster (tcp)" taggedAs LongRunningTest in {

val clusterTestUtil = new ClusterTestUtil(system.name)
try {
// create the first node with the "akka" protocol
clusterTestUtil.newActorSystem(configWithAkkaTcp)

// have a node using the "pekko" protocol join
val joiningNode = clusterTestUtil.newActorSystem(configWithPekkoTcp)
clusterTestUtil.formCluster()

awaitCond(clusterTestUtil.isMemberUp(joiningNode), message = "awaiting joining node to be 'Up'")
} finally {
clusterTestUtil.shutdownAll()
}
}

"allow a node using the pekko protocol to join the cluster (netty)" taggedAs LongRunningTest in {

val clusterTestUtil = new ClusterTestUtil(system.name)
try {
// create the first node with the "akka" protocol
clusterTestUtil.newActorSystem(configWithAkkaNetty)

// have a node using the "pekko" protocol join
val joiningNode = clusterTestUtil.newActorSystem(configWithPekkoNetty)
clusterTestUtil.formCluster()

awaitCond(clusterTestUtil.isMemberUp(joiningNode), message = "awaiting joining node to be 'Up'")
} finally {
clusterTestUtil.shutdownAll()
}
}
}
}
22 changes: 19 additions & 3 deletions remote/src/main/scala/org/apache/pekko/remote/Remoting.scala
Original file line number Diff line number Diff line change
Expand Up @@ -225,11 +225,12 @@ private[remote] class Remoting(_system: ExtendedActorSystem, _provider: RemoteAc
Await.result(addressesPromise.future, StartupTimeout.duration)
if (transports.isEmpty) throw new RemoteTransportException("No transport drivers were loaded.", null)

transportMapping = transports
val mapping = transports
.groupBy {
case (transport, _) => transport.schemeIdentifier
}
.map { case (k, v) => k -> v.toSet }
transportMapping = addProtocolsToMap(mapping)

defaultAddress = transports.head._2
addresses = transports.map { _._2 }.toSet
Expand Down Expand Up @@ -296,6 +297,21 @@ private[remote] class Remoting(_system: ExtendedActorSystem, _provider: RemoteAc
}
}
}

private def addProtocolsToMap(
map: Map[String, Set[(PekkoProtocolTransport, Address)]]): Map[String, Set[(PekkoProtocolTransport, Address)]] = {
if (AcceptProtocolNames.size > 1) {
map.flatMap { case (protocol, transports) =>
val tcpProtocol = protocol.endsWith(".tcp")
AcceptProtocolNames.map { newProtocol =>
if (tcpProtocol)
s"$newProtocol.tcp" -> transports
else
newProtocol -> transports
}
}
} else map
}
}

/**
Expand Down Expand Up @@ -567,7 +583,7 @@ private[remote] class EndpointManager(conf: Config, log: LoggingAdapter)
}

OneForOneStrategy(loggingEnabled = false) {
case InvalidAssociation(localAddress, remoteAddress, reason, disassiciationInfo) =>
case InvalidAssociation(localAddress, remoteAddress, reason, disassociationInfo) =>
keepQuarantinedOr(remoteAddress) {
val causedBy = if (reason.getCause == null) "" else s"Caused by: [${reason.getCause.getMessage}]"
log.warning(
Expand All @@ -580,7 +596,7 @@ private[remote] class EndpointManager(conf: Config, log: LoggingAdapter)
causedBy)
endpoints.markAsFailed(sender(), Deadline.now + settings.RetryGateClosedFor)
}
disassiciationInfo.foreach {
disassociationInfo.foreach {
case AssociationHandle.Quarantined =>
context.system.eventStream.publish(ThisActorSystemQuarantinedEvent(localAddress, remoteAddress))
case _ => // do nothing
Expand Down
Loading